/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

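// 4x4 inverse DCT with all 16 coefficients present. Two idct4_sse2() passes
// cover rows and columns (each pass transposes internally), then the final
// ROUND_POWER_OF_TWO(x, 4) is done as add-8/shift-right-4 before
// reconstructing against the destination pixels.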
void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i eight = _mm_set1_epi16(8);
  __m128i in[2];

  // Rows
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8);
  idct4_sse2(in);

  // Columns
  idct4_sse2(in);

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  recon_and_store4x4_sse2(in, dest, stride);
}

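// DC-only 4x4 path: only input[0] is nonzero, so the 2-D transform collapses
// to a single value. Each dct_const_round_shift(x * cospi_16_64) below is one
// pass's scaling of the DC term; the result is broadcast and added to all 16
// destination pixels.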
void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  const __m128i zero = _mm_setzero_si128();
  int a;
  __m128i dc_value, d[2];

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  // Reconstruction and Store
  d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
  d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
  d[0] = _mm_unpacklo_epi32(d[0],
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
  d[1] = _mm_unpacklo_epi32(
      _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
  d[0] = _mm_unpacklo_epi8(d[0], zero);
  d[1] = _mm_unpacklo_epi8(d[1], zero);
  d[0] = _mm_add_epi16(d[0], dc_value);
  d[1] = _mm_add_epi16(d[1], dc_value);
  d[0] = _mm_packus_epi16(d[0], d[1]);

  *(int *)dest = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
}

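// One 4-point IDCT pass on 16-bit data packed two rows per register. Roughly
// the scalar reference (idct4_c in vpx_dsp/inv_txfm.c):
//   step[0] = dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
//   step[1] = dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
//   step[2] = dct_const_round_shift(in[1] * cospi_24_64 - in[3] * cospi_8_64);
//   step[3] = dct_const_round_shift(in[1] * cospi_8_64 + in[3] * cospi_24_64);
//   out = { step[0] + step[3], step[1] + step[2],
//           step[1] - step[2], step[0] - step[3] };
// The madd/pack sequence below evaluates several columns of this butterfly at
// once; the final shuffle reorders in[1] so the stage-2 add/sub lines up.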
void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_16bit_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

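// 4-point inverse ADST. Same layout contract as idct4_sse2: two rows per
// register, transposed on entry, result written back over in[0]/in[1].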
void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_16bit_4x4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}

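// Shared butterfly helper: multiplies one interleaved 16-bit pair against two
// constant pairs, rounds with DCT_CONST_ROUNDING, shifts by DCT_CONST_BITS,
// and packs back to 16 bits. Relies on tmp0-tmp3 and `rounding` being in
// scope at the expansion site.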
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  {                                                                  \
    tmp0 = _mm_madd_epi16(lo_0, cst0);                               \
    tmp1 = _mm_madd_epi16(hi_0, cst0);                               \
    tmp2 = _mm_madd_epi16(lo_0, cst1);                               \
    tmp3 = _mm_madd_epi16(hi_0, cst1);                               \
                                                                     \
    tmp0 = _mm_add_epi32(tmp0, rounding);                            \
    tmp1 = _mm_add_epi32(tmp1, rounding);                            \
    tmp2 = _mm_add_epi32(tmp2, rounding);                            \
    tmp3 = _mm_add_epi32(tmp3, rounding);                            \
                                                                     \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                     \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                     \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                     \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                     \
                                                                     \
    res0 = _mm_packs_epi32(tmp0, tmp1);                              \
    res1 = _mm_packs_epi32(tmp2, tmp3);                              \
  }

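// Full 4-stage 8-point IDCT over eight registers. Like the helper above, it
// expects its context from the caller: the stg1_*/stg2_* cosine-pair
// constants, the stp1_*/stp2_* temporaries, tmp0-tmp3 and `rounding` must all
// be declared at the expansion site, and MULTIPLICATION_AND_ADD is the
// four-output analogue of MULTIPLICATION_AND_ADD_2.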
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
              out4, out5, out6, out7)                                         \
  {                                                                           \
    /* Stage1 */                                                              \
    {                                                                         \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);                     \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);                     \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);                     \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);                     \
                                                                              \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1,      \
                             stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6)  \
    }                                                                         \
                                                                              \
    /* Stage2 */                                                              \
    {                                                                         \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);                     \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);                     \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);                     \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);                     \
                                                                              \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1,      \
                             stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3)  \
                                                                              \
      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                 \
      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                 \
      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                 \
      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                 \
    }                                                                         \
                                                                              \
    /* Stage3 */                                                              \
    {                                                                         \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
                                                                              \
      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                 \
      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                 \
      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                 \
      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                 \
                                                                              \
      tmp0 = _mm_madd_epi16(lo_56, stg2_1);                                   \
      tmp1 = _mm_madd_epi16(hi_56, stg2_1);                                   \
      tmp2 = _mm_madd_epi16(lo_56, stg2_0);                                   \
      tmp3 = _mm_madd_epi16(hi_56, stg2_0);                                   \
                                                                              \
      tmp0 = _mm_add_epi32(tmp0, rounding);                                   \
      tmp1 = _mm_add_epi32(tmp1, rounding);                                   \
      tmp2 = _mm_add_epi32(tmp2, rounding);                                   \
      tmp3 = _mm_add_epi32(tmp3, rounding);                                   \
                                                                              \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                            \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                            \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                            \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                            \
                                                                              \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
    }                                                                         \
                                                                              \
    /* Stage4  */                                                             \
    out0 = _mm_add_epi16(stp1_0, stp2_7);                                     \
    out1 = _mm_add_epi16(stp1_1, stp1_6);                                     \
    out2 = _mm_add_epi16(stp1_2, stp1_5);                                     \
    out3 = _mm_add_epi16(stp1_3, stp2_4);                                     \
    out4 = _mm_sub_epi16(stp1_3, stp2_4);                                     \
    out5 = _mm_sub_epi16(stp1_2, stp1_5);                                     \
    out6 = _mm_sub_epi16(stp1_1, stp1_6);                                     \
    out7 = _mm_sub_epi16(stp1_0, stp2_7);                                     \
  }

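// 8x8 inverse DCT with all 64 coefficients. Two transpose+IDCT8 passes cover
// rows and columns, then ROUND_POWER_OF_TWO(x, 5) (add 16, shift right 5) and
// row-by-row reconstruction.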
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);
  in4 = load_input_data(input + 8 * 4);
  in5 = load_input_data(input + 8 * 5);
  in6 = load_input_data(input + 8 * 6);
  in7 = load_input_data(input + 8 * 7);

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
          in6, in7);
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}

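// DC-only 8x8 path: scale input[0] through both passes, round by 2^5, and
// splat the result over the whole 8x8 destination block.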
void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest + 0 * stride, dc_value);
  RECON_AND_STORE(dest + 1 * stride, dc_value);
  RECON_AND_STORE(dest + 2 * stride, dc_value);
  RECON_AND_STORE(dest + 3 * stride, dc_value);
  RECON_AND_STORE(dest + 4 * stride, dc_value);
  RECON_AND_STORE(dest + 5 * stride, dc_value);
  RECON_AND_STORE(dest + 6 * stride, dc_value);
  RECON_AND_STORE(dest + 7 * stride, dc_value);
}

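// Single 8-point IDCT pass over in[0..7], exposed (via inv_txfm_sse2.h) for
// the hybrid-transform wrappers; the output replaces the input registers.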
void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
                in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
        in[4], in[5], in[6], in[7]);
}

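// 8-point inverse ADST. Stage 1 widens to 32 bits for eight cosine-pair
// products at once, so the function keeps separate u/v/w register sets
// rather than going through the MULTIPLICATION_AND_ADD helpers.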
void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

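// 8x8 IDCT for at most 12 nonzero coefficients, all inside the top-left 4x4.
// The first pass therefore loads only four input rows and exploits the known
// zeros (e.g. _mm_unpackhi_epi16(in0, zero) stands in for the in1/in7 pair);
// the second pass is a full IDCT8 with the high four inputs zero.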
void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
  // Stage1
  {
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  {
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    tmp0 = _mm_add_epi16(stp1_4, stp1_5);
    tmp1 = _mm_sub_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp4 = _mm_add_epi16(stp2_0, stp2_2);
    tmp6 = _mm_sub_epi16(stp2_0, stp2_2);

    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  tmp0 = _mm_add_epi16(stp1_3, stp2_4);
  tmp1 = _mm_add_epi16(stp1_2, stp1_5);
  tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
  tmp3 = _mm_sub_epi16(stp1_2, stp1_5);

  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
        in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}

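// One full 16-point IDCT pass (stages 2-6; the stage-7 add/sub is done by the
// caller so it can write straight into the output array). Same
// implicit-context contract as IDCT8: the stg*_* constants, stp1_*/stp2_*
// temporaries, tmp registers and `rounding` come from the enclosing function.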
#define IDCT16                                                                 \
  /* Stage2 */                                                                 \
  {                                                                            \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]);                 \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]);                 \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);                   \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);                   \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]);                 \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]);                 \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]);                 \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]);                 \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1,   \
                           stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)   \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
                           stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
  }                                                                            \
                                                                               \
  /* Stage3 */                                                                 \
  {                                                                            \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]);                 \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]);                 \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]);                 \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]);                 \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
                           stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)     \
                                                                               \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);                                  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
                                                                               \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);                               \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
  }                                                                            \
                                                                               \
  /* Stage4 */                                                                 \
  {                                                                            \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);                   \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]);                   \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]);                 \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]);                 \
                                                                               \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1,   \
                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
                                                                               \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
                           stp2_13)                                            \
  }                                                                            \
                                                                               \
  /* Stage5 */                                                                 \
  {                                                                            \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
                                                                               \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
                                                                               \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
                                                                               \
    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
                                                                               \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
                                                                               \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
                                                                               \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
                                                                               \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
  }                                                                            \
                                                                               \
  /* Stage6 */                                                                 \
  {                                                                            \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
                                                                               \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
                           stp2_12)                                            \
  }

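// Reduced 16-point pass for the at-most-10-coefficient case: only in[0..3]
// can be nonzero, so the stage-2/3 butterflies collapse to single products
// and several outputs become plain copies of their partners.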
#define IDCT16_10                                                              \
  /* Stage2 */                                                                 \
  {                                                                            \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero);                   \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero);                   \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
                           stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11,         \
                           stp1_12_0)                                          \
  }                                                                            \
                                                                               \
  /* Stage3 */                                                                 \
  {                                                                            \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero);                   \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero);                   \
                                                                               \
    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
                                                                               \
    stp1_9 = stp1_8_0;                                                         \
    stp1_10 = stp1_11;                                                         \
                                                                               \
    stp1_13 = stp1_12_0;                                                       \
    stp1_14 = stp1_15;                                                         \
  }                                                                            \
                                                                               \
  /* Stage4 */                                                                 \
  {                                                                            \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);                    \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero);                    \
                                                                               \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
                                                                               \
    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1)   \
    stp2_5 = stp2_4;                                                           \
    stp2_6 = stp2_7;                                                           \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
                           stp2_13)                                            \
  }                                                                            \
                                                                               \
  /* Stage5 */                                                                 \
  {                                                                            \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
                                                                               \
    stp1_2 = stp1_1;                                                           \
    stp1_3 = stp1_0;                                                           \
                                                                               \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
                                                                               \
    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
                                                                               \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
                                                                               \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
                                                                               \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
                                                                               \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
  }                                                                            \
                                                                               \
  /* Stage6 */                                                                 \
  {                                                                            \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
                                                                               \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
                           stp2_12)                                            \
  }

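// 16x16 inverse DCT with all 256 coefficients. The block is processed as two
// 8x16 half-frames: pass 1 writes the left half to l[] and the right half to
// r[]; pass 2 transposes 8x8 tiles back, runs IDCT16 again, rounds by
// ROUND_POWER_OF_TWO(x, 6) and reconstructs eight columns at a time.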
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[16], l[16], r[16], *curr1;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  curr1 = l;
  for (i = 0; i < 2; i++) {
    // 1-D idct

    // Load input data.
    in[0] = load_input_data(input);
    in[8] = load_input_data(input + 8 * 1);
    in[1] = load_input_data(input + 8 * 2);
    in[9] = load_input_data(input + 8 * 3);
    in[2] = load_input_data(input + 8 * 4);
    in[10] = load_input_data(input + 8 * 5);
    in[3] = load_input_data(input + 8 * 6);
    in[11] = load_input_data(input + 8 * 7);
    in[4] = load_input_data(input + 8 * 8);
    in[12] = load_input_data(input + 8 * 9);
    in[5] = load_input_data(input + 8 * 10);
    in[13] = load_input_data(input + 8 * 11);
    in[6] = load_input_data(input + 8 * 12);
    in[14] = load_input_data(input + 8 * 13);
    in[7] = load_input_data(input + 8 * 14);
    in[15] = load_input_data(input + 8 * 15);

    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);

    IDCT16

    // Stage7
    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

    curr1 = r;
    input += 128;
  }
  for (i = 0; i < 2; i++) {
    int j;
    // 1-D idct
    array_transpose_8x8(l + i * 8, in);
    array_transpose_8x8(r + i * 8, in + 8);

    IDCT16

    // 2-D
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

1088void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
1089                              int stride) {
1090  __m128i dc_value;
1091  const __m128i zero = _mm_setzero_si128();
1092  int a, i;
1093
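  // DC-only shortcut: pass the single DC coefficient through both 1-D
  // transforms (each a multiply by cospi_16_64 with rounding), apply the
  // final rounding shift of 6, then add the constant to every pixel.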
  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 16; ++i) {
    RECON_AND_STORE(dest + 0, dc_value);
    RECON_AND_STORE(dest + 8, dc_value);
    dest += stride;
  }
}

static void iadst16_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

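  // stage 1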
  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

static void idct16_8col(__m128i *in) {
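  // perform 16x16 1-D IDCT for 8 columns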
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i v[16], u[16], s[16], t[16];

  // stage 1
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8] = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9] = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9] = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}

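// Apply one 1-D transform pass to a 16x16 block held as two 8x16 halves:
// transpose, then transform each half's 8 columns.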
void idct16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_8col(in0);
  idct16_8col(in1);
}

void iadst16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_8col(in0);
  iadst16_8col(in1);
}

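// 16x16 IDCT for the eob <= 10 case: only the top-left 4x4 coefficients can
// be nonzero.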
void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i in[16], l[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
      stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // First 1-D inverse DCT
  // Load input data.
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8 * 2);
  in[2] = load_input_data(input + 8 * 4);
  in[3] = load_input_data(input + 8 * 6);

  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);

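  // With at most 4 nonzero rows, pairs of stage outputs share one register
  // (low/high 64-bit halves), roughly halving the arithmetic below.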
  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);

    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);

    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

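    // stp2_8 packs stp_8 (low 64 bits) with stp_15 (high), and stp2_11 packs
    // stp_11 with stp_12; with the remaining odd inputs zero, those high
    // halves equal the stage-3 values stp1_14 and stp1_13.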
    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);

    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
    stp2_10 = _mm_packs_epi32(tmp5, tmp7);

    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
  }

  // Stage5 and Stage6
  {
    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);

    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);

    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
  }

  // Stage6
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_6 = _mm_packs_epi32(tmp3, tmp1);

    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp2, zero);
    stp2_11 = _mm_packs_epi32(tmp4, zero);
    stp2_12 = _mm_packs_epi32(tmp6, zero);

    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);

    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage7. Left 8x16 only.
  l[0] = _mm_add_epi16(stp2_0, stp1_15);
  l[1] = _mm_add_epi16(stp2_1, stp1_14);
  l[2] = _mm_add_epi16(stp2_2, stp2_13);
  l[3] = _mm_add_epi16(stp2_3, stp2_12);
  l[4] = _mm_add_epi16(stp2_4, stp2_11);
  l[5] = _mm_add_epi16(stp2_5, stp2_10);
  l[6] = _mm_add_epi16(stp2_6, stp1_9);
  l[7] = _mm_add_epi16(stp2_7, stp1_8);
  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
  l[15] = _mm_sub_epi16(stp2_0, stp1_15);

  // Second 1-D inverse transform, performed per 8x16 block
  for (i = 0; i < 2; i++) {
    int j;
    array_transpose_4X8(l + 8 * i, in);

    IDCT16_10

    // Stage7
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

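// Load 8 dequantized coefficients and advance the input pointer past them.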
#define LOAD_DQCOEFF(reg, input)  \
  {                               \
    reg = load_input_data(input); \
    input += 8;                   \
  }

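// 32x32 IDCT stages for the eob <= 34 case: only the top-left 8x8
// coefficients can be nonzero, so each missing partner below is the zero
// vector.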
#define IDCT32_34                                                              \
  /* Stage1 */                                                                 \
  {                                                                            \
    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero);                   \
    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero);                   \
                                                                               \
    const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]);                   \
    const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]);                   \
                                                                               \
    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero);                   \
    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero);                   \
                                                                               \
    const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
    const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
                                                                               \
    MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16,        \
                             stp1_31);                                         \
    MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19,        \
                             stp1_28);                                         \
    MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20,        \
                             stp1_27);                                         \
    MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23,      \
                             stp1_24);                                         \
  }                                                                            \
                                                                               \
  /* Stage2 */                                                                 \
  {                                                                            \
    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero);                   \
    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero);                   \
                                                                               \
    const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]);                   \
    const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]);                   \
                                                                               \
    MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8,         \
                             stp2_15);                                         \
    MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11,        \
                             stp2_12);                                         \
                                                                               \
    stp2_16 = stp1_16;                                                         \
    stp2_19 = stp1_19;                                                         \
                                                                               \
    stp2_20 = stp1_20;                                                         \
    stp2_23 = stp1_23;                                                         \
                                                                               \
    stp2_24 = stp1_24;                                                         \
    stp2_27 = stp1_27;                                                         \
                                                                               \
    stp2_28 = stp1_28;                                                         \
    stp2_31 = stp1_31;                                                         \
  }                                                                            \
                                                                               \
  /* Stage3 */                                                                 \
  {                                                                            \
    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero);                   \
    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero);                   \
                                                                               \
    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31);             \
    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31);             \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28);             \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28);             \
                                                                               \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27);             \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27);             \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24);             \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24);             \
                                                                             \
    MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4,       \
                             stp1_7);                                        \
                                                                             \
    stp1_8 = stp2_8;                                                         \
    stp1_11 = stp2_11;                                                       \
    stp1_12 = stp2_12;                                                       \
    stp1_15 = stp2_15;                                                       \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,   \
                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,         \
                           stp1_18, stp1_29)                                 \
    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,   \
                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,        \
                           stp1_22, stp1_25)                                 \
                                                                             \
    stp1_16 = stp2_16;                                                       \
    stp1_31 = stp2_31;                                                       \
    stp1_19 = stp2_19;                                                       \
    stp1_20 = stp2_20;                                                       \
    stp1_23 = stp2_23;                                                       \
    stp1_24 = stp2_24;                                                       \
    stp1_27 = stp2_27;                                                       \
    stp1_28 = stp2_28;                                                       \
  }                                                                          \
                                                                             \
  /* Stage4 */                                                               \
  {                                                                          \
    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero);                 \
    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero);                 \
                                                                             \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15);             \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15);             \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12);           \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12);           \
                                                                             \
    MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0,       \
                             stp2_1);                                        \
                                                                             \
    stp2_4 = stp1_4;                                                         \
    stp2_5 = stp1_4;                                                         \
    stp2_6 = stp1_7;                                                         \
    stp2_7 = stp1_7;                                                         \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,     \
                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
                           stp2_13)                                          \
                                                                             \
    stp2_8 = stp1_8;                                                         \
    stp2_15 = stp1_15;                                                       \
    stp2_11 = stp1_11;                                                       \
    stp2_12 = stp1_12;                                                       \
                                                                             \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                               \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                               \
    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                               \
    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                               \
    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                               \
    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                               \
    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                               \
    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                               \
                                                                             \
    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                               \
    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                               \
    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                               \
    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                               \
    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                               \
    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                               \
    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                               \
    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                               \
  }                                                                          \
                                                                             \
  /* Stage5 */                                                               \
  {                                                                          \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);           \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);           \
                                                                             \
    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);           \
    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);           \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);           \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);           \
                                                                             \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);           \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);           \
                                                                             \
    stp1_0 = stp2_0;                                                         \
    stp1_1 = stp2_1;                                                         \
    stp1_2 = stp2_1;                                                         \
    stp1_3 = stp2_0;                                                         \
                                                                             \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                   \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                   \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                   \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                   \
                                                                             \
    tmp0 = _mm_add_epi32(tmp0, rounding);                                    \
    tmp1 = _mm_add_epi32(tmp1, rounding);                                    \
    tmp2 = _mm_add_epi32(tmp2, rounding);                                    \
    tmp3 = _mm_add_epi32(tmp3, rounding);                                    \
                                                                             \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                             \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                             \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                             \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                             \
                                                                             \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                    \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                    \
                                                                             \
    stp1_4 = stp2_4;                                                         \
    stp1_7 = stp2_7;                                                         \
                                                                             \
    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                 \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                 \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                \
    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                \
    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                               \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                               \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                               \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                               \
                                                                             \
    stp1_16 = stp2_16;                                                       \
    stp1_17 = stp2_17;                                                       \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,   \
                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,         \
                           stp1_19, stp1_28)                                 \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,   \
                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,         \
                           stp1_21, stp1_26)                                 \
                                                                             \
    stp1_22 = stp2_22;                                                       \
    stp1_23 = stp2_23;                                                       \
    stp1_24 = stp2_24;                                                       \
    stp1_25 = stp2_25;                                                       \
    stp1_30 = stp2_30;                                                       \
    stp1_31 = stp2_31;                                                       \
  }                                                                          \
                                                                             \
  /* Stage6 */                                                               \
  {                                                                          \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);           \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);           \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);           \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);           \
                                                                             \
    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                  \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                  \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                  \
    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                  \
    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                  \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                  \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                  \
    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                  \
                                                                             \
    stp2_8 = stp1_8;                                                         \
    stp2_9 = stp1_9;                                                         \
    stp2_14 = stp1_14;                                                       \
    stp2_15 = stp1_15;                                                       \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,   \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13,         \
                           stp2_11, stp2_12)                                 \
                                                                             \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                               \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                               \
    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                               \
    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                               \
    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                               \
    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                               \
    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                               \
    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                               \
                                                                             \
    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                               \
    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                               \
    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                               \
    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                               \
    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                               \
    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                               \
    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                               \
    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                               \
  }                                                                          \
                                                                             \
  /* Stage7 */                                                               \
  {                                                                          \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);           \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);           \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);           \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);           \
                                                                             \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);           \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);           \
    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);           \
    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);           \
                                                                             \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                 \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                 \
    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                 \
    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                 \
    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                 \
    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                 \
    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                  \
    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                  \
    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                  \
    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                  \
    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                \
    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                \
    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                \
    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                \
    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                \
    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                \
                                                                             \
    stp1_16 = stp2_16;                                                       \
    stp1_17 = stp2_17;                                                       \
    stp1_18 = stp2_18;                                                       \
    stp1_19 = stp2_19;                                                       \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,   \
                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,         \
                           stp1_21, stp1_26)                                 \
    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,   \
                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,         \
                           stp1_23, stp1_24)                                 \
                                                                             \
    stp1_28 = stp2_28;                                                       \
    stp1_29 = stp2_29;                                                       \
    stp1_30 = stp2_30;                                                       \
    stp1_31 = stp2_31;                                                       \
  }

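/*
 * Both IDCT32_34 (above) and IDCT32 (below) are built almost entirely from
 * the MULTIPLICATION_AND_ADD butterfly.  As a rough scalar sketch (assuming
 * the DCT_CONST_BITS / DCT_CONST_ROUNDING definitions from vpx_dsp's
 * txfm_common headers), each 16-bit output computed from an interleaved
 * input pair (x, y) and a cosine pair (c0, c1) packed by pair_set_epi16()
 * is approximately:
 *
 *   out = ROUND_POWER_OF_TWO(x * c0 + y * c1, DCT_CONST_BITS);
 *
 * _mm_madd_epi16 forms the two 16x16 products and their 32-bit sum in one
 * step; the rounding add and arithmetic shift follow, and _mm_packs_epi32
 * saturates the 32-bit results back down to 16-bit lanes.
 */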
#define IDCT32                                                               \
  /* Stage1 */                                                               \
  {                                                                          \
    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]);               \
    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]);               \
    const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]);             \
    const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]);             \
                                                                             \
    const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]);               \
    const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]);               \
    const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]);               \
    const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]);               \
                                                                             \
    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]);               \
    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]);               \
    const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]);             \
    const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]);             \
                                                                             \
    const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]);             \
    const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]);             \
    const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]);               \
    const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]);               \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,     \
                           stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,         \
                           stp1_17, stp1_30)                                 \
    MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,       \
                           stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,         \
                           stp1_19, stp1_28)                                 \
    MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,     \
                           stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,       \
                           stp1_21, stp1_26)                                 \
    MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,    \
                           stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,      \
                           stp1_23, stp1_24)                                 \
  }                                                                          \
                                                                             \
  /* Stage2 */                                                               \
  {                                                                          \
    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]);               \
    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]);               \
    const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]);             \
    const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]);             \
                                                                             \
    const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]);             \
    const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]);             \
    const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]);               \
    const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]);               \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,     \
                           stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,  \
                           stp2_14)                                          \
    MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,     \
                           stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,         \
                           stp2_11, stp2_12)                                 \
                                                                             \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_17);                               \
    stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);                               \
    stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);                               \
    stp2_19 = _mm_add_epi16(stp1_19, stp1_18);                               \
                                                                             \
    stp2_20 = _mm_add_epi16(stp1_20, stp1_21);                               \
    stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);                               \
    stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);                               \
    stp2_23 = _mm_add_epi16(stp1_23, stp1_22);                               \
                                                                             \
    stp2_24 = _mm_add_epi16(stp1_24, stp1_25);                               \
    stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);                               \
    stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);                               \
    stp2_27 = _mm_add_epi16(stp1_27, stp1_26);                               \
                                                                             \
    stp2_28 = _mm_add_epi16(stp1_28, stp1_29);                               \
    stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);                               \
    stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);                               \
    stp2_31 = _mm_add_epi16(stp1_31, stp1_30);                               \
  }                                                                          \
                                                                             \
  /* Stage3 */                                                               \
  {                                                                          \
    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]);               \
    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]);               \
    const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]);             \
    const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]);             \
                                                                             \
    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);           \
    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);           \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);           \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);           \
                                                                             \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);           \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);           \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);           \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);           \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,     \
                           stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,   \
                           stp1_6)                                           \
                                                                             \
    stp1_8 = _mm_add_epi16(stp2_8, stp2_9);                                  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                  \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                               \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                               \
    stp1_12 = _mm_add_epi16(stp2_12, stp2_13);                               \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                               \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                               \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                               \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,   \
                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,         \
                           stp1_18, stp1_29)                                 \
    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,   \
                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,        \
                           stp1_22, stp1_25)                                 \
                                                                             \
    stp1_16 = stp2_16;                                                       \
    stp1_31 = stp2_31;                                                       \
    stp1_19 = stp2_19;                                                       \
    stp1_20 = stp2_20;                                                       \
    stp1_23 = stp2_23;                                                       \
    stp1_24 = stp2_24;                                                       \
    stp1_27 = stp2_27;                                                       \
    stp1_28 = stp2_28;                                                       \
  }                                                                          \
                                                                             \
  /* Stage4 */                                                               \
  {                                                                          \
    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]);               \
    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]);               \
    const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]);               \
    const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]);               \
                                                                             \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);             \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);             \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);           \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);           \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,       \
                           stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, stp2_2,   \
                           stp2_3)                                           \
                                                                             \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                  \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                  \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                  \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                  \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,     \
                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
                           stp2_13)                                          \
                                                                             \
    stp2_8 = stp1_8;                                                         \
    stp2_15 = stp1_15;                                                       \
    stp2_11 = stp1_11;                                                       \
    stp2_12 = stp1_12;                                                       \
                                                                             \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                               \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                               \
    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                               \
    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                               \
    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                               \
    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                               \
    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                               \
    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                               \
                                                                             \
    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                               \
    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                               \
    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                               \
    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                               \
    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                               \
    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                               \
    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                               \
    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                               \
  }                                                                          \
                                                                             \
  /* Stage5 */                                                               \
  {                                                                          \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);           \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);           \
                                                                             \
    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);           \
    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);           \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);           \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);           \
                                                                             \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);           \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);           \
                                                                             \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                  \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                  \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                  \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                  \
                                                                             \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                   \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                   \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                   \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                   \
                                                                             \
    tmp0 = _mm_add_epi32(tmp0, rounding);                                    \
    tmp1 = _mm_add_epi32(tmp1, rounding);                                    \
    tmp2 = _mm_add_epi32(tmp2, rounding);                                    \
    tmp3 = _mm_add_epi32(tmp3, rounding);                                    \
                                                                             \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                             \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                             \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                             \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                             \
                                                                             \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                    \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                    \
                                                                             \
    stp1_4 = stp2_4;                                                         \
    stp1_7 = stp2_7;                                                         \
                                                                             \
    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                 \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                 \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                \
    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                \
    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                               \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                               \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                               \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                               \
                                                                             \
    stp1_16 = stp2_16;                                                       \
    stp1_17 = stp2_17;                                                       \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,   \
                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,         \
                           stp1_19, stp1_28)                                 \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,   \
                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,         \
                           stp1_21, stp1_26)                                 \
                                                                             \
    stp1_22 = stp2_22;                                                       \
    stp1_23 = stp2_23;                                                       \
    stp1_24 = stp2_24;                                                       \
    stp1_25 = stp2_25;                                                       \
    stp1_30 = stp2_30;                                                       \
    stp1_31 = stp2_31;                                                       \
  }                                                                          \
                                                                             \
  /* Stage6 */                                                               \
  {                                                                          \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);           \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);           \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);           \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);           \
                                                                             \
    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                  \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                  \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                  \
    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                  \
    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                  \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                  \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                  \
    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                  \
                                                                             \
    stp2_8 = stp1_8;                                                         \
    stp2_9 = stp1_9;                                                         \
    stp2_14 = stp1_14;                                                       \
    stp2_15 = stp1_15;                                                       \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,   \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13,         \
                           stp2_11, stp2_12)                                 \
                                                                             \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                               \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                               \
    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                               \
    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                               \
    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                               \
    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                               \
    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                               \
    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                               \
                                                                             \
    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                               \
    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                               \
    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                               \
    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                               \
    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                               \
    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                               \
    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                               \
    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                               \
  }                                                                          \
                                                                             \
  /* Stage7 */                                                               \
  {                                                                          \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);           \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);           \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);           \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);           \
                                                                             \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);           \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);           \
    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);           \
    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);           \
                                                                             \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                 \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                 \
    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                 \
    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                 \
    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                 \
    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                 \
    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                  \
    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                  \
    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                  \
    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                  \
    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                \
    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                \
    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                \
    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                \
    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                \
    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                \
                                                                             \
    stp1_16 = stp2_16;                                                       \
    stp1_17 = stp2_17;                                                       \
    stp1_18 = stp2_18;                                                       \
    stp1_19 = stp2_19;                                                       \
                                                                             \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,   \
                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,         \
                           stp1_21, stp1_26)                                 \
    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,   \
                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,         \
                           stp1_23, stp1_24)                                 \
                                                                             \
    stp1_28 = stp2_28;                                                       \
    stp1_29 = stp2_29;                                                       \
    stp1_30 = stp2_30;                                                       \
    stp1_31 = stp2_31;                                                       \
  }

// Only upper-left 8x8 has non-zero coeff
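// With at most 34 non-zero coefficients, the scan order used for 32x32
// blocks should keep every non-zero value inside that 8x8 corner, so each
// 1-D pass below only needs eight input rows; the remaining terms are
// treated as zero by IDCT32_34.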
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[32];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data. Only need to load the top left 8x8 block.
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 32);
  in[2] = load_input_data(input + 64);
  in[3] = load_input_data(input + 96);
  in[4] = load_input_data(input + 128);
  in[5] = load_input_data(input + 160);
  in[6] = load_input_data(input + 192);
  in[7] = load_input_data(input + 224);
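  // Rows of the 32x32 coefficient block are 32 entries apart, hence the
  // multiples of 32 above; each load_input_data() call packs one 8-wide row
  // of coefficients into 16-bit lanes.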

  array_transpose_8x8(in, in);
  IDCT32_34

  // 1_D: Store 32 intermediate results for each 8x32 block.
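  // This is the last butterfly of the 32-point transform written out
  // explicitly: col[k] = stp1_k + stp1_(31-k) and
  // col[31-k] = stp1_k - stp1_(31-k) for k = 0 .. 15.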
  col[0] = _mm_add_epi16(stp1_0, stp1_31);
  col[1] = _mm_add_epi16(stp1_1, stp1_30);
  col[2] = _mm_add_epi16(stp1_2, stp1_29);
  col[3] = _mm_add_epi16(stp1_3, stp1_28);
  col[4] = _mm_add_epi16(stp1_4, stp1_27);
  col[5] = _mm_add_epi16(stp1_5, stp1_26);
  col[6] = _mm_add_epi16(stp1_6, stp1_25);
  col[7] = _mm_add_epi16(stp1_7, stp1_24);
  col[8] = _mm_add_epi16(stp1_8, stp1_23);
  col[9] = _mm_add_epi16(stp1_9, stp1_22);
  col[10] = _mm_add_epi16(stp1_10, stp1_21);
  col[11] = _mm_add_epi16(stp1_11, stp1_20);
  col[12] = _mm_add_epi16(stp1_12, stp1_19);
  col[13] = _mm_add_epi16(stp1_13, stp1_18);
  col[14] = _mm_add_epi16(stp1_14, stp1_17);
  col[15] = _mm_add_epi16(stp1_15, stp1_16);
  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
  for (i = 0; i < 4; i++) {
    int j;
    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + i * 8, in);
    IDCT32_34

    // 2_D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

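    // The two 1-D passes leave the results scaled up by 2^6, so the loop
    // below applies ROUND_POWER_OF_TWO(x, 6): add 1 << 5, then shift right
    // arithmetically by 6.  _mm_adds_epi16 is used for the bias so values
    // near INT16_MAX saturate instead of wrapping.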
    for (j = 0; j < 32; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

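// Full inverse transform for the general case: all 1024 coefficients may be
// non-zero.  The block is processed as four 8x32 strips; a strip whose
// coefficients are all zero short-circuits the first 1-D pass (see the
// zero_idx test below).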
void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[128], zero_idx[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j, i32;

  for (i = 0; i < 4; i++) {
    i32 = (i << 5);
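    // col[] holds four transposed 8x32 strips of first-pass output; strip i
    // occupies col[i32] .. col[i32 + 31].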
    // First 1-D idct
    // Load input data.
    LOAD_DQCOEFF(in[0], input);
    LOAD_DQCOEFF(in[8], input);
    LOAD_DQCOEFF(in[16], input);
    LOAD_DQCOEFF(in[24], input);
    LOAD_DQCOEFF(in[1], input);
    LOAD_DQCOEFF(in[9], input);
    LOAD_DQCOEFF(in[17], input);
    LOAD_DQCOEFF(in[25], input);
    LOAD_DQCOEFF(in[2], input);
    LOAD_DQCOEFF(in[10], input);
    LOAD_DQCOEFF(in[18], input);
    LOAD_DQCOEFF(in[26], input);
    LOAD_DQCOEFF(in[3], input);
    LOAD_DQCOEFF(in[11], input);
    LOAD_DQCOEFF(in[19], input);
    LOAD_DQCOEFF(in[27], input);

    LOAD_DQCOEFF(in[4], input);
    LOAD_DQCOEFF(in[12], input);
    LOAD_DQCOEFF(in[20], input);
    LOAD_DQCOEFF(in[28], input);
    LOAD_DQCOEFF(in[5], input);
    LOAD_DQCOEFF(in[13], input);
    LOAD_DQCOEFF(in[21], input);
    LOAD_DQCOEFF(in[29], input);
    LOAD_DQCOEFF(in[6], input);
    LOAD_DQCOEFF(in[14], input);
    LOAD_DQCOEFF(in[22], input);
    LOAD_DQCOEFF(in[30], input);
    LOAD_DQCOEFF(in[7], input);
    LOAD_DQCOEFF(in[15], input);
    LOAD_DQCOEFF(in[23], input);
    LOAD_DQCOEFF(in[31], input);
3053
    // Check whether all coefficients in this 8x32 slice are zero.
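    // zero_idx forms a 5-level OR-reduction tree over the 32 vectors; the
    // single surviving vector zero_idx[14] is zero only if every coefficient
    // in the slice is zero.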
    zero_idx[0] = _mm_or_si128(in[0], in[1]);
    zero_idx[1] = _mm_or_si128(in[2], in[3]);
    zero_idx[2] = _mm_or_si128(in[4], in[5]);
    zero_idx[3] = _mm_or_si128(in[6], in[7]);
    zero_idx[4] = _mm_or_si128(in[8], in[9]);
    zero_idx[5] = _mm_or_si128(in[10], in[11]);
    zero_idx[6] = _mm_or_si128(in[12], in[13]);
    zero_idx[7] = _mm_or_si128(in[14], in[15]);
    zero_idx[8] = _mm_or_si128(in[16], in[17]);
    zero_idx[9] = _mm_or_si128(in[18], in[19]);
    zero_idx[10] = _mm_or_si128(in[20], in[21]);
    zero_idx[11] = _mm_or_si128(in[22], in[23]);
    zero_idx[12] = _mm_or_si128(in[24], in[25]);
    zero_idx[13] = _mm_or_si128(in[26], in[27]);
    zero_idx[14] = _mm_or_si128(in[28], in[29]);
    zero_idx[15] = _mm_or_si128(in[30], in[31]);

    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);

    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);

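    // cmpeq against zero sets all bytes of each equal 32-bit lane; a
    // movemask of 0xFFFF therefore means every lane of the OR-reduction
    // is zero.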
    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
      // All-zero slice: emit zeros and skip the butterflies entirely.
      for (j = 0; j < 32; ++j) {
        col[i32 + j] = _mm_setzero_si128();
      }
      continue;
    }

    // Transpose the 8x32 block quadrant-by-quadrant (four 8x8 transposes).
    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);
    array_transpose_8x8(in + 16, in + 16);
    array_transpose_8x8(in + 24, in + 24);

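    // IDCT32 (a macro defined earlier in this file) runs the butterfly
    // stages on in[0..31] and leaves its results in stp1_0..stp1_31; the
    // final mirror-image add/sub stage is applied inline below.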
    IDCT32

    // 1-D: store the 32 intermediate results for this 8x32 slice.
    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
  }
  for (i = 0; i < 4; i++) {
    // Second 1-D idct
    j = i << 3;

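    // col[] is laid out as four 8x32 slices of 32 vectors each, so columns
    // j..j+7 of the intermediate result are gathered from col + j,
    // col + j + 32, col + j + 64 and col + j + 96.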
    // Transpose the 8x32 block quadrant-by-quadrant (four 8x8 transposes).
    array_transpose_8x8(col + j, in);
    array_transpose_8x8(col + j + 32, in + 8);
    array_transpose_8x8(col + j + 64, in + 16);
    array_transpose_8x8(col + j + 96, in + 24);

    IDCT32

    // 2-D: calculate the final results and store them to the destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

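    // Divide by 64 with rounding: the two transform passes leave the result
    // scaled up by 2^6. final_rounding is declared earlier in the function
    // (it should be 1 << 5 to pair with the shift by 6 below).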
    for (j = 0; j < 32; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, j;

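  // DC-only shortcut: with a single nonzero (DC) coefficient, every output
  // pixel receives the same value, so compute it in scalar code and splat it.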
  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

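  // RECON_AND_STORE (defined earlier in this file) adds the value to eight
  // destination pixels with unsigned saturation; four calls cover each
  // 32-pixel row.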
  for (j = 0; j < 32; ++j) {
    RECON_AND_STORE(dest + 0 + j * stride, dc_value);
    RECON_AND_STORE(dest + 8 + j * stride, dc_value);
    RECON_AND_STORE(dest + 16 + j * stride, dc_value);
    RECON_AND_STORE(dest + 24 + j * stride, dc_value);
  }
}
