// vp9_dct_sse2.c, revision 91037db265ecdd914a26e056cf69207b4f50924e
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>     // for assert() in the tx_type switches
#include <emmintrin.h>  // SSE2
#include "vp9/common/vp9_idct.h"  // for cospi constants
#include "vpx_ports/mem.h"

void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  const int stride = pitch >> 1;
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in0, in1, in2, in3;
  // Load inputs.
  {
    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
    // x = x << 4
    in0 = _mm_slli_epi16(in0, 4);
    in1 = _mm_slli_epi16(in1, 4);
    in2 = _mm_slli_epi16(in2, 4);
    in3 = _mm_slli_epi16(in3, 4);
    // if (i == 0 && input[0]) input[0] += 1;
    {
      // The mask will only contain whether the first value is zero; all
      // other comparisons will fail as something shifted by 4 (above << 4)
      // can never be equal to one. To increment in the non-zero case, we
      // add the mask and one for the first element:
      //   - if zero, mask = -1, v = v - 1 + 1 = v
      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
      in0 = _mm_add_epi16(in0, mask);
      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
    }
  }
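  // For reference, the bias block above amounts to this scalar logic on the
  // first element only (an illustrative sketch, not part of the build):
  //   v = input[0] << 4;
  //   if (v != 0) v += 1;  // mask = 0, so only bias_b's +1 lands
  //   // when v == 0, mask = -1 and it cancels bias_b's +1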
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // Transform 1/2: Add/subtract
    const __m128i r0 = _mm_add_epi16(in0, in3);
    const __m128i r1 = _mm_add_epi16(in1, in2);
    const __m128i r2 = _mm_sub_epi16(in1, in2);
    const __m128i r3 = _mm_sub_epi16(in0, in3);
    // Transform 1/2: Interleave to do the multiply by constants which gets us
    //                into 32 bits.
    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
    // Combine and transpose
    const __m128i res0 = _mm_packs_epi32(w0, w2);
    const __m128i res1 = _mm_packs_epi32(w4, w6);
    // 00 01 02 03 20 21 22 23
    // 10 11 12 13 30 31 32 33
    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
    // 00 10 01 11 02 12 03 13
    // 20 30 21 31 22 32 23 33
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
    if (0 == pass) {
      // Extract values in the high part for second pass as transform code
      // only uses the first four values.
      in1 = _mm_unpackhi_epi64(in0, in0);
      in3 = _mm_unpackhi_epi64(in2, in2);
    } else {
      // Post-condition output and store it (v + 1) >> 2, taking advantage
      // of the fact 1/3 are stored just after 0/2.
      __m128i out01 = _mm_add_epi16(in0, kOne);
      __m128i out23 = _mm_add_epi16(in2, kOne);
      out01 = _mm_srai_epi16(out01, 2);
      out23 = _mm_srai_epi16(out23, 2);
      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
    }
  }
}
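
// For reference, a plain-C sketch of the 4-point column transform that each
// SIMD pass above implements. This is illustrative only (kept out of the
// build) and assumes the dct_const_round_shift() helper from vp9_idct.h.
#if 0
static void fdct4_1d_ref(const int16_t in[4], int16_t out[4]) {
  const int s0 = in[0] + in[3];
  const int s1 = in[1] + in[2];
  const int s2 = in[1] - in[2];
  const int s3 = in[0] - in[3];
  out[0] = (int16_t)dct_const_round_shift((s0 + s1) * cospi_16_64);
  out[2] = (int16_t)dct_const_round_shift((s0 - s1) * cospi_16_64);
  out[1] = (int16_t)dct_const_round_shift(s2 * cospi_24_64 + s3 * cospi_8_64);
  out[3] = (int16_t)dct_const_round_shift(s3 * cospi_24_64 - s2 * cospi_8_64);
}
#endif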

void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) {
  vp9_short_fdct4x4_sse2(input, output, pitch);
  vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}

static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;

  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);

  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}

static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
}

static INLINE void transpose_4x4(__m128i *res) {
  // Combine and transpose
  // 00 01 02 03 20 21 22 23
  // 10 11 12 13 30 31 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // 00 10 20 30 01 11 21 31
  // 02 12 22 32 03 13 23 33
  // only use the first 4 16-bit integers
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

void fdct4_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u[4], v[4];
  u[0] = _mm_add_epi16(in[0], in[3]);
  u[1] = _mm_add_epi16(in[1], in[2]);
  u[2] = _mm_sub_epi16(in[1], in[2]);
  u[3] = _mm_sub_epi16(in[0], in[3]);

  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
  v[1] = _mm_unpacklo_epi16(u[2], u[3]);
  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
  u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08);  // 1
  u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24);  // 3

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  transpose_4x4(in);
}

void fadst4_1d_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  __m128i in7 = _mm_add_epi16(in[0], in[1]);
  in7 = _mm_sub_epi16(in7, in[3]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = v[2];
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_sub_epi32(u[4], v[5]);
  u[6] = _mm_add_epi32(u[3], u[5]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}
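
// For reference, a plain-C sketch of the 4-point ADST flow graph the SIMD
// code above implements (derived from the vector code; illustrative only,
// kept out of the build, and assuming dct_const_round_shift() and the
// sinpi constants from vp9_idct.h).
#if 0
static void fadst4_1d_ref(const int16_t in[4], int16_t out[4]) {
  const int s0 = sinpi_1_9 * in[0] + sinpi_2_9 * in[1];  // v[0]
  const int s1 = sinpi_3_9 * in[2] + sinpi_4_9 * in[3];  // v[1]
  const int s2 = sinpi_3_9 * (in[0] + in[1] - in[3]);    // v[2]
  const int s3 = sinpi_4_9 * in[0] - sinpi_1_9 * in[1];  // v[3]
  const int s4 = sinpi_2_9 * in[3] - sinpi_3_9 * in[2];  // v[4]
  const int s5 = sinpi_3_9 * in[2];                      // v[5]
  const int x0 = s0 + s1;
  const int x2 = s3 + s4;
  out[0] = (int16_t)dct_const_round_shift(x0);
  out[1] = (int16_t)dct_const_round_shift(s2);
  out[2] = (int16_t)dct_const_round_shift(x2);
  out[3] = (int16_t)dct_const_round_shift(x2 - x0 + 3 * s5);
}
#endif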

void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
                           int stride, int tx_type) {
  __m128i in[4];
  load_buffer_4x4(input, in, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct4_1d_sse2(in);
      fdct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      fadst4_1d_sse2(in);
      fdct4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      fdct4_1d_sse2(in);
      fadst4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      fadst4_1d_sse2(in);
      fadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }
  write_buffer_4x4(output, in);
}

void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
  const int stride = pitch >> 1;
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Load input
  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
    // Work on next four results
    {
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    //    division of a 16-bit signed number by two using shifts:
    //    n / 2 = (n - (n >> 15)) >> 1
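    // A worked example of the identity above: for n = -5, n >> 15 = -1, so
    // (-5 - (-1)) >> 1 = -4 >> 1 = -2, which is -5 / 2 truncated toward
    // zero, whereas a plain -5 >> 1 would give -3 (floor division).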
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
    // store results
    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
  }
}

// load 8x8 array
static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
  in[0]  = _mm_load_si128((__m128i *)(input + 0 * stride));
  in[1]  = _mm_load_si128((__m128i *)(input + 1 * stride));
  in[2]  = _mm_load_si128((__m128i *)(input + 2 * stride));
  in[3]  = _mm_load_si128((__m128i *)(input + 3 * stride));
  in[4]  = _mm_load_si128((__m128i *)(input + 4 * stride));
  in[5]  = _mm_load_si128((__m128i *)(input + 5 * stride));
  in[6]  = _mm_load_si128((__m128i *)(input + 6 * stride));
  in[7]  = _mm_load_si128((__m128i *)(input + 7 * stride));

  in[0] = _mm_slli_epi16(in[0], 2);
  in[1] = _mm_slli_epi16(in[1], 2);
  in[2] = _mm_slli_epi16(in[2], 2);
  in[3] = _mm_slli_epi16(in[3], 2);
  in[4] = _mm_slli_epi16(in[4], 2);
  in[5] = _mm_slli_epi16(in[5], 2);
  in[6] = _mm_slli_epi16(in[6], 2);
  in[7] = _mm_slli_epi16(in[7], 2);
}

// right shift and rounding
static INLINE void right_shift_8x8(__m128i *res, int const bit) {
  const __m128i kOne = _mm_set1_epi16(1);
  const int bit_m02 = bit - 2;
  __m128i sign0 = _mm_srai_epi16(res[0], 15);
  __m128i sign1 = _mm_srai_epi16(res[1], 15);
  __m128i sign2 = _mm_srai_epi16(res[2], 15);
  __m128i sign3 = _mm_srai_epi16(res[3], 15);
  __m128i sign4 = _mm_srai_epi16(res[4], 15);
  __m128i sign5 = _mm_srai_epi16(res[5], 15);
  __m128i sign6 = _mm_srai_epi16(res[6], 15);
  __m128i sign7 = _mm_srai_epi16(res[7], 15);

  if (bit_m02 >= 0) {
    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
    res[0] = _mm_add_epi16(res[0], k_const_rounding);
    res[1] = _mm_add_epi16(res[1], k_const_rounding);
    res[2] = _mm_add_epi16(res[2], k_const_rounding);
    res[3] = _mm_add_epi16(res[3], k_const_rounding);
    res[4] = _mm_add_epi16(res[4], k_const_rounding);
    res[5] = _mm_add_epi16(res[5], k_const_rounding);
    res[6] = _mm_add_epi16(res[6], k_const_rounding);
    res[7] = _mm_add_epi16(res[7], k_const_rounding);
  }

  res[0] = _mm_sub_epi16(res[0], sign0);
  res[1] = _mm_sub_epi16(res[1], sign1);
  res[2] = _mm_sub_epi16(res[2], sign2);
  res[3] = _mm_sub_epi16(res[3], sign3);
  res[4] = _mm_sub_epi16(res[4], sign4);
  res[5] = _mm_sub_epi16(res[5], sign5);
  res[6] = _mm_sub_epi16(res[6], sign6);
  res[7] = _mm_sub_epi16(res[7], sign7);

  res[0] = _mm_srai_epi16(res[0], bit);
  res[1] = _mm_srai_epi16(res[1], bit);
  res[2] = _mm_srai_epi16(res[2], bit);
  res[3] = _mm_srai_epi16(res[3], bit);
  res[4] = _mm_srai_epi16(res[4], bit);
  res[5] = _mm_srai_epi16(res[5], bit);
  res[6] = _mm_srai_epi16(res[6], bit);
  res[7] = _mm_srai_epi16(res[7], bit);
}
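
// A scalar sketch of the per-lane arithmetic in right_shift_8x8() above
// (illustrative only): for each 16-bit value v,
//   const int sign = v >> 15;            // 0 or -1 (arithmetic shift)
//   if (bit >= 2) v += 1 << (bit - 2);   // rounding term
//   v = (v - sign) >> bit;               // shift with truncation fix-up
// With bit == 1 this is v / 2 truncated toward zero; with bit == 2 it is
// (v + 1) >> 2 with the same fix-up for negative values.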

// write 8x8 array
static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
}

// perform an 8x8 transpose (in and res may alias, giving an in-place
// transpose as used in this file)
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  // 04 14 05 15 06 16 07 17
  // 24 34 25 35 26 36 27 37
  // 40 50 41 51 42 52 43 53
  // 60 70 61 71 62 72 63 73
  // 44 54 45 55 46 56 47 57
  // 64 74 65 75 66 76 67 77
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  // 00 10 20 30 01 11 21 31
  // 40 50 60 70 41 51 61 71
  // 02 12 22 32 03 13 23 33
  // 42 52 62 72 43 53 63 73
  // 04 14 24 34 05 15 25 35
  // 44 54 64 74 45 55 65 75
  // 06 16 26 36 07 17 27 37
  // 46 56 66 76 47 57 67 77
  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
  // 00 10 20 30 40 50 60 70
  // 01 11 21 31 41 51 61 71
  // 02 12 22 32 42 52 62 72
  // 03 13 23 33 43 53 63 73
  // 04 14 24 34 44 54 64 74
  // 05 15 25 35 45 55 65 75
  // 06 16 26 36 46 56 66 76
  // 07 17 27 37 47 57 67 77
}
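
// For reference, the unpack ladder above is equivalent to this scalar
// transpose (illustrative only, kept out of the build; unlike the SIMD
// version, this scalar form must not be called with aliasing buffers):
#if 0
static void array_transpose_8x8_ref(const int16_t in[8][8],
                                    int16_t out[8][8]) {
  int i, j;
  for (i = 0; i < 8; ++i)
    for (j = 0; j < 8; ++j)
      out[j][i] = in[i][j];
}
#endif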

void fdct8_1d_sse2(__m128i *in) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  array_transpose_8x8(in, in);
}
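
// For reference, a plain-C sketch of the 8-point DCT stages implemented by
// fdct8_1d_sse2() above (derived from the vector code; illustrative only,
// kept out of the build, assuming dct_const_round_shift() from vp9_idct.h).
#if 0
static void fdct8_1d_ref(const int16_t in[8], int16_t out[8]) {
  // stage 1: butterflies on the input
  const int s0 = in[0] + in[7], s1 = in[1] + in[6];
  const int s2 = in[2] + in[5], s3 = in[3] + in[4];
  const int s4 = in[3] - in[4], s5 = in[2] - in[5];
  const int s6 = in[1] - in[6], s7 = in[0] - in[7];
  // even half: a 4-point DCT on s0..s3
  const int u0 = s0 + s3, u1 = s1 + s2, u2 = s1 - s2, u3 = s0 - s3;
  // stage 2: rotate the middle pair of the odd half
  const int t0 = dct_const_round_shift((s6 - s5) * cospi_16_64);
  const int t1 = dct_const_round_shift((s6 + s5) * cospi_16_64);
  // stage 3: butterflies on the odd half
  const int x0 = s4 + t0, x1 = s4 - t0, x2 = s7 - t1, x3 = s7 + t1;
  // stage 4: final rotations
  out[0] = (int16_t)dct_const_round_shift((u0 + u1) * cospi_16_64);
  out[4] = (int16_t)dct_const_round_shift((u0 - u1) * cospi_16_64);
  out[2] = (int16_t)dct_const_round_shift(u2 * cospi_24_64 + u3 * cospi_8_64);
  out[6] = (int16_t)dct_const_round_shift(u3 * cospi_24_64 - u2 * cospi_8_64);
  out[1] = (int16_t)dct_const_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64);
  out[7] = (int16_t)dct_const_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64);
  out[5] = (int16_t)dct_const_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64);
  out[3] = (int16_t)dct_const_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64);
}
#endif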

void fadst8_1d_sse2(__m128i *in) {
  // Constants
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // properly aligned for butterfly input
  in0  = in[7];
  in1  = in[0];
  in2  = in[5];
  in3  = in[2];
  in4  = in[3];
  in5  = in[4];
  in6  = in[1];
  in7  = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // FIXME(jingning): do subtract using bit inversion?
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose
  array_transpose_8x8(in, in);
}
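
// Note on the final assignment block in fadst8_1d_sse2() above: the ADST
// outputs leave the butterflies in the order {s0, s4, s6, s2, s3, s7, s5,
// s1}, and the ADST basis requires the odd output positions to be negated,
// which is done with 0 - x since SSE2 has no 16-bit negate.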

void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
                           int stride, int tx_type) {
  __m128i in[8];
  load_buffer_8x8(input, in, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct8_1d_sse2(in);
      fdct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      fadst8_1d_sse2(in);
      fdct8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      fdct8_1d_sse2(in);
      fadst8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      fadst8_1d_sse2(in);
      fadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }
  right_shift_8x8(in, 1);
  write_buffer_8x8(output, in, 8);
}
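
// Hypothetical usage of the hybrid transform above (the buffer names and
// the tx_type value are illustrative, not taken from the library):
//   DECLARE_ALIGNED(16, int16_t, src_diff[8 * 8]);
//   DECLARE_ALIGNED(16, int16_t, coeff[8 * 8]);
//   vp9_short_fht8x8_sse2(src_diff, coeff, 8, 3 /* ADST_ADST */);
// The loads and stores use _mm_load_si128/_mm_store_si128, so both buffers
// must be 16-byte aligned and the stride (in int16_t units) must keep each
// row aligned.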

void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  const int stride = pitch >> 1;
  int pass;
  // We need an intermediate buffer between passes.
  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
  int16_t *in = input;
  int16_t *out = intermediate;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kOne = _mm_set1_epi16(1);
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
    for (column_start = 0; column_start < 16; column_start += 8) {
      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
      __m128i step1_0, step1_1, step1_2, step1_3;
      __m128i step1_4, step1_5, step1_6, step1_7;
      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
      __m128i step3_0, step3_1, step3_2, step3_3;
      __m128i step3_4, step3_5, step3_6, step3_7;
      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
      // Load and pre-condition input.
      if (0 == pass) {
        in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
        in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
        in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
        in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
        in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
        in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
        in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
        in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
        in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
        in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
        in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
        in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
        in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
        in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
        in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
        in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
        // x = x << 2
        in00 = _mm_slli_epi16(in00, 2);
        in01 = _mm_slli_epi16(in01, 2);
        in02 = _mm_slli_epi16(in02, 2);
        in03 = _mm_slli_epi16(in03, 2);
        in04 = _mm_slli_epi16(in04, 2);
        in05 = _mm_slli_epi16(in05, 2);
        in06 = _mm_slli_epi16(in06, 2);
        in07 = _mm_slli_epi16(in07, 2);
        in08 = _mm_slli_epi16(in08, 2);
        in09 = _mm_slli_epi16(in09, 2);
        in10 = _mm_slli_epi16(in10, 2);
        in11 = _mm_slli_epi16(in11, 2);
        in12 = _mm_slli_epi16(in12, 2);
        in13 = _mm_slli_epi16(in13, 2);
        in14 = _mm_slli_epi16(in14, 2);
        in15 = _mm_slli_epi16(in15, 2);
      } else {
        in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
        in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
        in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
        in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
        in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
        in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
        in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
        in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
        in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
        in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
        in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
        in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
        in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
        in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
        in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
        in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
        // x = (x + 1) >> 2
        in00 = _mm_add_epi16(in00, kOne);
        in01 = _mm_add_epi16(in01, kOne);
        in02 = _mm_add_epi16(in02, kOne);
        in03 = _mm_add_epi16(in03, kOne);
        in04 = _mm_add_epi16(in04, kOne);
        in05 = _mm_add_epi16(in05, kOne);
        in06 = _mm_add_epi16(in06, kOne);
        in07 = _mm_add_epi16(in07, kOne);
        in08 = _mm_add_epi16(in08, kOne);
        in09 = _mm_add_epi16(in09, kOne);
        in10 = _mm_add_epi16(in10, kOne);
        in11 = _mm_add_epi16(in11, kOne);
        in12 = _mm_add_epi16(in12, kOne);
        in13 = _mm_add_epi16(in13, kOne);
        in14 = _mm_add_epi16(in14, kOne);
        in15 = _mm_add_epi16(in15, kOne);
        in00 = _mm_srai_epi16(in00, 2);
        in01 = _mm_srai_epi16(in01, 2);
        in02 = _mm_srai_epi16(in02, 2);
        in03 = _mm_srai_epi16(in03, 2);
        in04 = _mm_srai_epi16(in04, 2);
        in05 = _mm_srai_epi16(in05, 2);
        in06 = _mm_srai_epi16(in06, 2);
        in07 = _mm_srai_epi16(in07, 2);
        in08 = _mm_srai_epi16(in08, 2);
        in09 = _mm_srai_epi16(in09, 2);
        in10 = _mm_srai_epi16(in10, 2);
        in11 = _mm_srai_epi16(in11, 2);
        in12 = _mm_srai_epi16(in12, 2);
        in13 = _mm_srai_epi16(in13, 2);
        in14 = _mm_srai_epi16(in14, 2);
        in15 = _mm_srai_epi16(in15, 2);
      }
      in += 8;
      // Calculate input for the first 8 results.
      {
        input0 = _mm_add_epi16(in00, in15);
        input1 = _mm_add_epi16(in01, in14);
        input2 = _mm_add_epi16(in02, in13);
        input3 = _mm_add_epi16(in03, in12);
        input4 = _mm_add_epi16(in04, in11);
        input5 = _mm_add_epi16(in05, in10);
        input6 = _mm_add_epi16(in06, in09);
        input7 = _mm_add_epi16(in07, in08);
      }
      // Calculate input for the next 8 results.
      {
        step1_0 = _mm_sub_epi16(in07, in08);
        step1_1 = _mm_sub_epi16(in06, in09);
        step1_2 = _mm_sub_epi16(in05, in10);
        step1_3 = _mm_sub_epi16(in04, in11);
        step1_4 = _mm_sub_epi16(in03, in12);
        step1_5 = _mm_sub_epi16(in02, in13);
        step1_6 = _mm_sub_epi16(in01, in14);
        step1_7 = _mm_sub_epi16(in00, in15);
      }
      // Work on the first eight values; fdct8_1d(input, even_results);
      {
1227        // Add/substract
1228        const __m128i q0 = _mm_add_epi16(input0, input7);
1229        const __m128i q1 = _mm_add_epi16(input1, input6);
1230        const __m128i q2 = _mm_add_epi16(input2, input5);
1231        const __m128i q3 = _mm_add_epi16(input3, input4);
1232        const __m128i q4 = _mm_sub_epi16(input3, input4);
1233        const __m128i q5 = _mm_sub_epi16(input2, input5);
1234        const __m128i q6 = _mm_sub_epi16(input1, input6);
1235        const __m128i q7 = _mm_sub_epi16(input0, input7);
1236        // Work on first four results
1237        {
1238          // Add/substract
          const __m128i r0 = _mm_add_epi16(q0, q3);
          const __m128i r1 = _mm_add_epi16(q1, q2);
          const __m128i r2 = _mm_sub_epi16(q1, q2);
          const __m128i r3 = _mm_sub_epi16(q0, q3);
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
          // Combine
          res00 = _mm_packs_epi32(w0, w1);
          res08 = _mm_packs_epi32(w2, w3);
          res04 = _mm_packs_epi32(w4, w5);
          res12 = _mm_packs_epi32(w6, w7);
        }
        // Work on next four results
        {
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
          // dct_const_round_shift
          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
          // Combine
          const __m128i r0 = _mm_packs_epi32(s0, s1);
          const __m128i r1 = _mm_packs_epi32(s2, s3);
          // Add/subtract
          const __m128i x0 = _mm_add_epi16(q4, r0);
          const __m128i x1 = _mm_sub_epi16(q4, r0);
          const __m128i x2 = _mm_sub_epi16(q7, r1);
          const __m128i x3 = _mm_add_epi16(q7, r1);
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
          // Combine
          res02 = _mm_packs_epi32(w0, w1);
          res14 = _mm_packs_epi32(w2, w3);
          res10 = _mm_packs_epi32(w4, w5);
          res06 = _mm_packs_epi32(w6, w7);
        }
      }
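      // All eight even-indexed outputs (res00, res02, ..., res14) are now
      // complete; step1_0..step1_7 drive the odd-indexed outputs below.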
      // Work on the next eight values; step1 -> odd_results
      {
        // step 2
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_2 = _mm_packs_epi32(w0, w1);
          step2_3 = _mm_packs_epi32(w2, w3);
        }
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_5 = _mm_packs_epi32(w0, w1);
          step2_4 = _mm_packs_epi32(w2, w3);
        }
        // step 3
        {
          step3_0 = _mm_add_epi16(step1_0, step2_3);
          step3_1 = _mm_add_epi16(step1_1, step2_2);
          step3_2 = _mm_sub_epi16(step1_1, step2_2);
          step3_3 = _mm_sub_epi16(step1_0, step2_3);
          step3_4 = _mm_sub_epi16(step1_7, step2_4);
          step3_5 = _mm_sub_epi16(step1_6, step2_5);
          step3_6 = _mm_add_epi16(step1_6, step2_5);
          step3_7 = _mm_add_epi16(step1_7, step2_4);
        }
        // step 4
        {
          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_1 = _mm_packs_epi32(w0, w1);
          step2_2 = _mm_packs_epi32(w2, w3);
        }
        {
          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_6 = _mm_packs_epi32(w0, w1);
          step2_5 = _mm_packs_epi32(w2, w3);
        }
        // step 5
        {
          step1_0 = _mm_add_epi16(step3_0, step2_1);
          step1_1 = _mm_sub_epi16(step3_0, step2_1);
          step1_2 = _mm_sub_epi16(step3_3, step2_2);
          step1_3 = _mm_add_epi16(step3_3, step2_2);
          step1_4 = _mm_add_epi16(step3_4, step2_5);
          step1_5 = _mm_sub_epi16(step3_4, step2_5);
          step1_6 = _mm_sub_epi16(step3_7, step2_6);
          step1_7 = _mm_add_epi16(step3_7, step2_6);
        }
        // step 6
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res01 = _mm_packs_epi32(w0, w1);
          res09 = _mm_packs_epi32(w2, w3);
        }
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res05 = _mm_packs_epi32(w0, w1);
          res13 = _mm_packs_epi32(w2, w3);
        }
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res11 = _mm_packs_epi32(w0, w1);
          res03 = _mm_packs_epi32(w2, w3);
        }
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res15 = _mm_packs_epi32(w0, w1);
          res07 = _mm_packs_epi32(w2, w3);
        }
      }
      // Transpose the results, do it as two 8x8 transposes.
      {
        // 00 01 02 03 04 05 06 07
        // 10 11 12 13 14 15 16 17
        // 20 21 22 23 24 25 26 27
        // 30 31 32 33 34 35 36 37
        // 40 41 42 43 44 45 46 47
        // 50 51 52 53 54 55 56 57
        // 60 61 62 63 64 65 66 67
        // 70 71 72 73 74 75 76 77
        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
        // 00 10 01 11 02 12 03 13
        // 20 30 21 31 22 32 23 33
        // 04 14 05 15 06 16 07 17
        // 24 34 25 35 26 36 27 37
        // 40 50 41 51 42 52 43 53
        // 60 70 61 71 62 72 63 73
        // 44 54 45 55 46 56 47 57
        // 64 74 65 75 66 76 67 77
        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
        // 00 10 20 30 01 11 21 31
        // 40 50 60 70 41 51 61 71
        // 02 12 22 32 03 13 23 33
        // 42 52 62 72 43 53 63 73
        // 04 14 24 34 05 15 25 35
        // 44 54 64 74 45 55 65 75
        // 06 16 26 36 07 17 27 37
        // 46 56 66 76 47 57 67 77
        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
        // 00 10 20 30 40 50 60 70
        // 01 11 21 31 41 51 61 71
        // 02 12 22 32 42 52 62 72
        // 03 13 23 33 43 53 63 73
        // 04 14 24 34 44 54 64 74
        // 05 15 25 35 45 55 65 75
        // 06 16 26 36 46 56 66 76
        // 07 17 27 37 47 57 67 77
        _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
        _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
        _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
        _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
        _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
        _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
        _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
        _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
      }
      {
        // 00 01 02 03 04 05 06 07
        // 10 11 12 13 14 15 16 17
        // 20 21 22 23 24 25 26 27
        // 30 31 32 33 34 35 36 37
        // 40 41 42 43 44 45 46 47
        // 50 51 52 53 54 55 56 57
        // 60 61 62 63 64 65 66 67
        // 70 71 72 73 74 75 76 77
        const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
        const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
        const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
        const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
        const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
        const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
        const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
        const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
        // 00 10 01 11 02 12 03 13
        // 20 30 21 31 22 32 23 33
        // 04 14 05 15 06 16 07 17
        // 24 34 25 35 26 36 27 37
        // 40 50 41 51 42 52 43 53
        // 60 70 61 71 62 72 63 73
        // 44 54 45 55 46 56 47 57
        // 64 74 65 75 66 76 67 77
        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
        // 00 10 20 30 01 11 21 31
        // 40 50 60 70 41 51 61 71
        // 02 12 22 32 03 13 23 33
        // 42 52 62 72 43 53 63 73
        // 04 14 24 34 05 15 25 35
        // 44 54 64 74 45 55 65 75
        // 06 16 26 36 07 17 27 37
        // 46 56 66 76 47 57 67 77
        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
        // 00 10 20 30 40 50 60 70
        // 01 11 21 31 41 51 61 71
        // 02 12 22 32 42 52 62 72
        // 03 13 23 33 43 53 63 73
        // 04 14 24 34 44 54 64 74
        // 05 15 25 35 45 55 65 75
        // 06 16 26 36 46 56 66 76
        // 07 17 27 37 47 57 67 77
        // Store results
        _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
        _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
        _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
        _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
        _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
        _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
        _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
        _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
      }
      out += 8*16;
    }
    // Setup in/out for next pass.
    in = intermediate;
    out = output;
  }
}

static INLINE void load_buffer_16x16(int16_t *input, __m128i *in0,
                                     __m128i *in1, int stride) {
  // load first 8 columns
  load_buffer_8x8(input, in0, stride);
  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);

  input += 8;
  // load second 8 columns
  load_buffer_8x8(input, in1, stride);
  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
}
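
// A usage sketch (hypothetical caller, not part of this file): the 16x16
// block is held as two arrays of 16 rows each, in0 covering columns 0-7
// and in1 covering columns 8-15, one __m128i (8 x int16_t) per row:
//   __m128i in0[16], in1[16];
//   load_buffer_16x16(input, in0, in1, stride);
//   /* ... transform columns, transpose, transform again ... */
//   write_buffer_16x16(output, in0, in1, stride);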

static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
                                      __m128i *in1, int stride) {
  // write first 8 columns
  write_buffer_8x8(output, in0, stride);
  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
  // write second 8 columns
  output += 8;
  write_buffer_8x8(output, in1, stride);
  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}

static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
  __m128i tbuf[8];
  array_transpose_8x8(res0, res0);
  array_transpose_8x8(res1, tbuf);
  array_transpose_8x8(res0 + 8, res1);
  array_transpose_8x8(res1 + 8, res1 + 8);

  res0[8] = tbuf[0];
  res0[9] = tbuf[1];
  res0[10] = tbuf[2];
  res0[11] = tbuf[3];
  res0[12] = tbuf[4];
  res0[13] = tbuf[5];
  res0[14] = tbuf[6];
  res0[15] = tbuf[7];
}
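
// The 16x16 transpose works per 8x8 quadrant: the diagonal quadrants
// (res0[0..7] and res1[8..15]) transpose in place, while the off-diagonal
// quadrants transpose and trade places (res0[8..15] <-> res1[0..7]), with
// tbuf holding the old top-right quadrant during the swap.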

static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
  // perform rounding operations
  right_shift_8x8(res0, 2);
  right_shift_8x8(res0 + 8, 2);
  right_shift_8x8(res1, 2);
  right_shift_8x8(res1 + 8, 2);
}

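// Every butterfly stage below uses the same idiom: interleave two rows of
// eight int16_t lanes, multiply-accumulate against a pair of cosine
// constants with _mm_madd_epi16, then apply dct_const_round_shift and pack
// back to 16 bits. A scalar sketch of one lane (assuming the vp9_idct.h
// definitions DCT_CONST_BITS == 14, DCT_CONST_ROUNDING == 1 << 13):
//   int32_t sum = (int32_t)a * c0 + (int32_t)b * c1;  // _mm_madd_epi16
//   int16_t out = (int16_t)((sum + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);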
void fdct16_1d_8col(__m128i *in) {
  // perform a 16-point 1-D DCT on 8 columns
  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  // stage 1
  i[0] = _mm_add_epi16(in[0], in[15]);
  i[1] = _mm_add_epi16(in[1], in[14]);
  i[2] = _mm_add_epi16(in[2], in[13]);
  i[3] = _mm_add_epi16(in[3], in[12]);
  i[4] = _mm_add_epi16(in[4], in[11]);
  i[5] = _mm_add_epi16(in[5], in[10]);
  i[6] = _mm_add_epi16(in[6], in[9]);
  i[7] = _mm_add_epi16(in[7], in[8]);

  s[0] = _mm_sub_epi16(in[7], in[8]);
  s[1] = _mm_sub_epi16(in[6], in[9]);
  s[2] = _mm_sub_epi16(in[5], in[10]);
  s[3] = _mm_sub_epi16(in[4], in[11]);
  s[4] = _mm_sub_epi16(in[3], in[12]);
  s[5] = _mm_sub_epi16(in[2], in[13]);
  s[6] = _mm_sub_epi16(in[1], in[14]);
  s[7] = _mm_sub_epi16(in[0], in[15]);

  p[0] = _mm_add_epi16(i[0], i[7]);
  p[1] = _mm_add_epi16(i[1], i[6]);
  p[2] = _mm_add_epi16(i[2], i[5]);
  p[3] = _mm_add_epi16(i[3], i[4]);
  p[4] = _mm_sub_epi16(i[3], i[4]);
  p[5] = _mm_sub_epi16(i[2], i[5]);
  p[6] = _mm_sub_epi16(i[1], i[6]);
  p[7] = _mm_sub_epi16(i[0], i[7]);

  u[0] = _mm_add_epi16(p[0], p[3]);
  u[1] = _mm_add_epi16(p[1], p[2]);
  u[2] = _mm_sub_epi16(p[1], p[2]);
  u[3] = _mm_sub_epi16(p[0], p[3]);

  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
  v[3] = _mm_unpackhi_epi16(u[2], u[3]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[4] = _mm_packs_epi32(u[4], u[5]);
  in[8] = _mm_packs_epi32(u[2], u[3]);
  in[12] = _mm_packs_epi32(u[6], u[7]);

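  // Outputs 0, 4, 8 and 12 are finished; the p[4]..p[7] sub-butterfly
  // below produces outputs 2, 6, 10 and 14 to complete the even half.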
  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[2], v[3]);

  t[0] = _mm_add_epi16(p[4], u[0]);
  t[1] = _mm_sub_epi16(p[4], u[0]);
  t[2] = _mm_sub_epi16(p[7], u[1]);
  t[3] = _mm_add_epi16(p[7], u[1]);

  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
  u[3] = _mm_unpackhi_epi16(t[1], t[2]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  in[2] = _mm_packs_epi32(v[0], v[1]);
  in[6] = _mm_packs_epi32(v[4], v[5]);
  in[10] = _mm_packs_epi32(v[2], v[3]);
  in[14] = _mm_packs_epi32(v[6], v[7]);

  // stage 2
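  // (the middle differences s[2]..s[5] are scaled by cospi_16_64 into the
  //  sums and differences consumed by stage 3)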
  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
  u[3] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[2] = _mm_packs_epi32(v[0], v[1]);
  t[3] = _mm_packs_epi32(v[2], v[3]);
  t[4] = _mm_packs_epi32(v[4], v[5]);
  t[5] = _mm_packs_epi32(v[6], v[7]);

  // stage 3
  p[0] = _mm_add_epi16(s[0], t[3]);
  p[1] = _mm_add_epi16(s[1], t[2]);
  p[2] = _mm_sub_epi16(s[1], t[2]);
  p[3] = _mm_sub_epi16(s[0], t[3]);
  p[4] = _mm_sub_epi16(s[7], t[4]);
  p[5] = _mm_sub_epi16(s[6], t[5]);
  p[6] = _mm_add_epi16(s[6], t[5]);
  p[7] = _mm_add_epi16(s[7], t[4]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
  u[3] = _mm_unpackhi_epi16(p[2], p[5]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
  v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
  v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[1] = _mm_packs_epi32(v[0], v[1]);
  t[2] = _mm_packs_epi32(v[2], v[3]);
  t[5] = _mm_packs_epi32(v[4], v[5]);
  t[6] = _mm_packs_epi32(v[6], v[7]);

  // stage 5
  s[0] = _mm_add_epi16(p[0], t[1]);
  s[1] = _mm_sub_epi16(p[0], t[1]);
  s[2] = _mm_sub_epi16(p[3], t[2]);
  s[3] = _mm_add_epi16(p[3], t[2]);
  s[4] = _mm_add_epi16(p[4], t[5]);
  s[5] = _mm_sub_epi16(p[4], t[5]);
  s[6] = _mm_sub_epi16(p[7], t[6]);
  s[7] = _mm_add_epi16(p[7], t[6]);

  // stage 6
  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
  u[7] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[1]  = _mm_packs_epi32(v[0], v[1]);
  in[9]  = _mm_packs_epi32(v[2], v[3]);
  in[5]  = _mm_packs_epi32(v[4], v[5]);
  in[13] = _mm_packs_epi32(v[6], v[7]);
  in[3]  = _mm_packs_epi32(v[8], v[9]);
  in[11] = _mm_packs_epi32(v[10], v[11]);
  in[7]  = _mm_packs_epi32(v[12], v[13]);
  in[15] = _mm_packs_epi32(v[14], v[15]);
}

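// fadst16_1d_8col mirrors the layout of fdct16_1d_8col, but each ADST
// butterfly combines two madd results per output while still in 32-bit
// precision (e.g. u[i] = v[i] +/- v[i + 16] in stage 1), so the rounding
// shift comes after the cross-term add/subtract rather than right after
// the multiply.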
void fadst16_1d_8col(__m128i *in) {
  // perform a 16-point 1-D ADST on 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

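  // stage 1: pair the inputs from the two ends inward -- (in[15], in[0]),
  // (in[13], in[2]), ..., (in[1], in[14]) -- and rotate each pair by its
  // own odd-angle constant pair (cospi_1/cospi_31, cospi_5/cospi_27, ...).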
2096  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
2097  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
2098  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
2099  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
2100  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
2101  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
2102  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
2103  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
2104  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
2105  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
2106  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
2107  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
2108  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
2109  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
2110  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
2111  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
2112
2113  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
2114  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
2115  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
2116  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
2117  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
2118  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
2119  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
2120  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
2121  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
2122  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
2123  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
2124  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
2125  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
2126  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
2127  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
2128  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
2129  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
2130  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
2131  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
2132  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
2133  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
2134  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
2135  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
2136  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
2137  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
2138  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
2139  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
2140  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
2141  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
2142  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
2143  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
2144  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
2145
2146  u[0] = _mm_add_epi32(v[0], v[16]);
2147  u[1] = _mm_add_epi32(v[1], v[17]);
2148  u[2] = _mm_add_epi32(v[2], v[18]);
2149  u[3] = _mm_add_epi32(v[3], v[19]);
2150  u[4] = _mm_add_epi32(v[4], v[20]);
2151  u[5] = _mm_add_epi32(v[5], v[21]);
2152  u[6] = _mm_add_epi32(v[6], v[22]);
2153  u[7] = _mm_add_epi32(v[7], v[23]);
2154  u[8] = _mm_add_epi32(v[8], v[24]);
2155  u[9] = _mm_add_epi32(v[9], v[25]);
2156  u[10] = _mm_add_epi32(v[10], v[26]);
2157  u[11] = _mm_add_epi32(v[11], v[27]);
2158  u[12] = _mm_add_epi32(v[12], v[28]);
2159  u[13] = _mm_add_epi32(v[13], v[29]);
2160  u[14] = _mm_add_epi32(v[14], v[30]);
2161  u[15] = _mm_add_epi32(v[15], v[31]);
2162  u[16] = _mm_sub_epi32(v[0], v[16]);
2163  u[17] = _mm_sub_epi32(v[1], v[17]);
2164  u[18] = _mm_sub_epi32(v[2], v[18]);
2165  u[19] = _mm_sub_epi32(v[3], v[19]);
2166  u[20] = _mm_sub_epi32(v[4], v[20]);
2167  u[21] = _mm_sub_epi32(v[5], v[21]);
2168  u[22] = _mm_sub_epi32(v[6], v[22]);
2169  u[23] = _mm_sub_epi32(v[7], v[23]);
2170  u[24] = _mm_sub_epi32(v[8], v[24]);
2171  u[25] = _mm_sub_epi32(v[9], v[25]);
2172  u[26] = _mm_sub_epi32(v[10], v[26]);
2173  u[27] = _mm_sub_epi32(v[11], v[27]);
2174  u[28] = _mm_sub_epi32(v[12], v[28]);
2175  u[29] = _mm_sub_epi32(v[13], v[29]);
2176  u[30] = _mm_sub_epi32(v[14], v[30]);
2177  u[31] = _mm_sub_epi32(v[15], v[31]);
2178
2179  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2180  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2181  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2182  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2183  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2184  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2185  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2186  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2187  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2188  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2189  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2190  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2191  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2192  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2193  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2194  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2195  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
2196  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
2197  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
2198  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
2199  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
2200  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
2201  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
2202  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
2203  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
2204  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
2205  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
2206  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
2207  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
2208  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
2209  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
2210  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
2211
2212  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2213  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2214  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2215  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2216  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2217  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2218  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2219  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2220  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2221  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2222  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2223  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2224  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2225  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2226  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2227  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2228  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
2229  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
2230  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
2231  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
2232  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
2233  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
2234  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
2235  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
2236  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
2237  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
2238  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
2239  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
2240  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
2241  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
2242  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
2243  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
2244
2245  s[0] = _mm_packs_epi32(u[0], u[1]);
2246  s[1] = _mm_packs_epi32(u[2], u[3]);
2247  s[2] = _mm_packs_epi32(u[4], u[5]);
2248  s[3] = _mm_packs_epi32(u[6], u[7]);
2249  s[4] = _mm_packs_epi32(u[8], u[9]);
2250  s[5] = _mm_packs_epi32(u[10], u[11]);
2251  s[6] = _mm_packs_epi32(u[12], u[13]);
2252  s[7] = _mm_packs_epi32(u[14], u[15]);
2253  s[8] = _mm_packs_epi32(u[16], u[17]);
2254  s[9] = _mm_packs_epi32(u[18], u[19]);
2255  s[10] = _mm_packs_epi32(u[20], u[21]);
2256  s[11] = _mm_packs_epi32(u[22], u[23]);
2257  s[12] = _mm_packs_epi32(u[24], u[25]);
2258  s[13] = _mm_packs_epi32(u[26], u[27]);
2259  s[14] = _mm_packs_epi32(u[28], u[29]);
2260  s[15] = _mm_packs_epi32(u[30], u[31]);
2261
2262  // stage 2
2263  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
2264  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
2265  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
2266  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
2267  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
2268  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
2269  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
2270  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
2271
2272  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
2273  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
2274  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
2275  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
2276  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
2277  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
2278  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
2279  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
2280  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
2281  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
2282  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
2283  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
2284  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
2285  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
2286  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
2287  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
2288
2289  u[0] = _mm_add_epi32(v[0], v[8]);
2290  u[1] = _mm_add_epi32(v[1], v[9]);
2291  u[2] = _mm_add_epi32(v[2], v[10]);
2292  u[3] = _mm_add_epi32(v[3], v[11]);
2293  u[4] = _mm_add_epi32(v[4], v[12]);
2294  u[5] = _mm_add_epi32(v[5], v[13]);
2295  u[6] = _mm_add_epi32(v[6], v[14]);
2296  u[7] = _mm_add_epi32(v[7], v[15]);
2297  u[8] = _mm_sub_epi32(v[0], v[8]);
2298  u[9] = _mm_sub_epi32(v[1], v[9]);
2299  u[10] = _mm_sub_epi32(v[2], v[10]);
2300  u[11] = _mm_sub_epi32(v[3], v[11]);
2301  u[12] = _mm_sub_epi32(v[4], v[12]);
2302  u[13] = _mm_sub_epi32(v[5], v[13]);
2303  u[14] = _mm_sub_epi32(v[6], v[14]);
2304  u[15] = _mm_sub_epi32(v[7], v[15]);
2305
2306  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2307  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2308  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2309  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2310  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2311  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2312  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2313  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2314  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2315  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2316  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2317  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2318  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2319  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2320  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2321  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2322
2323  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2324  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2325  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2326  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2327  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2328  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2329  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2330  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2331  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2332  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2333  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2334  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2335  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2336  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2337  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2338  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2339
2340  x[0] = _mm_add_epi16(s[0], s[4]);
2341  x[1] = _mm_add_epi16(s[1], s[5]);
2342  x[2] = _mm_add_epi16(s[2], s[6]);
2343  x[3] = _mm_add_epi16(s[3], s[7]);
2344  x[4] = _mm_sub_epi16(s[0], s[4]);
2345  x[5] = _mm_sub_epi16(s[1], s[5]);
2346  x[6] = _mm_sub_epi16(s[2], s[6]);
2347  x[7] = _mm_sub_epi16(s[3], s[7]);
2348  x[8] = _mm_packs_epi32(u[0], u[1]);
2349  x[9] = _mm_packs_epi32(u[2], u[3]);
2350  x[10] = _mm_packs_epi32(u[4], u[5]);
2351  x[11] = _mm_packs_epi32(u[6], u[7]);
2352  x[12] = _mm_packs_epi32(u[8], u[9]);
2353  x[13] = _mm_packs_epi32(u[10], u[11]);
2354  x[14] = _mm_packs_epi32(u[12], u[13]);
2355  x[15] = _mm_packs_epi32(u[14], u[15]);
2356
  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

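// The 16x16 2-D transforms work on two 16x8 column halves: in0 holds the
// left eight columns and in1 the right eight. Each half gets the 16-point
// 1-D transform applied down its columns, and the 16x16 transpose turns the
// result back into row order for the next pass.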
void fdct16_1d_sse2(__m128i *in0, __m128i *in1) {
  fdct16_1d_8col(in0);
  fdct16_1d_8col(in1);
  array_transpose_16x16(in0, in1);
}

void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
  fadst16_1d_8col(in0);
  fadst16_1d_8col(in1);
  array_transpose_16x16(in0, in1);
}

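// Hybrid 16x16 forward transform: tx_type selects DCT or ADST independently
// for the two 1-D passes, and right_shift_16x16 applies a rounded >> 2
// scaling between the passes so the intermediate values stay within 16-bit
// range.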
void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
                             int stride, int tx_type) {
  __m128i in0[16], in1[16];
  load_buffer_16x16(input, in0, in1, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_1d_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      fadst16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_1d_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      fdct16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_1d_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      fadst16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_1d_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }
  write_buffer_16x16(output, in0, in1, 16);
}

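// Reduced-precision 32x32 forward DCT (the "_rd" suffix presumably refers to
// its use in rate-distortion search). Instead of carrying full precision
// through the second pass, it scales the intermediate values down with a
// rounded >> 2 after stage 3 of that pass, which keeps every computation
// within 16-bit lanes at the cost of a small accuracy loss.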
void vp9_short_fdct32x32_rd_sse2(int16_t *input,
                                 int16_t *output_org, int pitch) {
  // Calculate pre-multiplied strides
  const int str1 = pitch >> 1;
  const int str2 = pitch;
  const int str3 = pitch + str1;
  // We need an intermediate buffer between passes.
  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64,   cospi_24_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64,  cospi_8_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64,  cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64,  cospi_12_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64,   cospi_28_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64,  cospi_4_64);
  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64,  cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64,  cospi_18_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64,  cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64,   cospi_26_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64,  cospi_6_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64,  cospi_22_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64,  cospi_14_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64,   cospi_30_64);
  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64,  cospi_1_64);
  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64,  cospi_17_64);
  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64,  cospi_9_64);
  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64,   cospi_25_64);
  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64,  cospi_7_64);
  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64,   cospi_23_64);
  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64,  cospi_15_64);
  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64,   cospi_31_64);
  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64,  cospi_5_64);
  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64,  cospi_21_64);
  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64,  cospi_13_64);
  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64,   cospi_29_64);
  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64,  cospi_3_64);
  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64,  cospi_19_64);
  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64,  cospi_11_64);
  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64,   cospi_27_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i kOne  = _mm_set1_epi16(1);
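  // Adding k__DCT_CONST_ROUNDING and then arithmetic-shifting right by
  // DCT_CONST_BITS is the vector form of the scalar helper:
  //   dct_const_round_shift(a) == (a + (1 << (DCT_CONST_BITS - 1)))
  //                                   >> DCT_CONST_BITS
  // i.e. a round-to-nearest division by 2^DCT_CONST_BITS after the cospi
  // multiplies.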
  // Do the two transform/transpose passes
  int pass;
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
    for (column_start = 0; column_start < 32; column_start += 8) {
      __m128i step1[32];
      __m128i step2[32];
      __m128i step3[32];
      __m128i out[32];
      // Stage 1
      // Note: even though all the loads below are aligned, using the aligned
      //       intrinsic makes the code slightly slower.
      if (0 == pass) {
        int16_t *in  = &input[column_start];
        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
        // Note: the next four blocks could be in a loop. That would help the
        //       instruction cache but is actually slower.
        {
          int16_t *ina =  in +  0 * str1;
          int16_t *inb =  in + 31 * str1;
          __m128i *step1a = &step1[ 0];
          __m128i *step1b = &step1[31];
          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          int16_t *ina =  in +  4 * str1;
          int16_t *inb =  in + 27 * str1;
          __m128i *step1a = &step1[ 4];
          __m128i *step1b = &step1[27];
          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          int16_t *ina =  in +  8 * str1;
          int16_t *inb =  in + 23 * str1;
          __m128i *step1a = &step1[ 8];
          __m128i *step1b = &step1[23];
          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          int16_t *ina =  in + 12 * str1;
          int16_t *inb =  in + 19 * str1;
          __m128i *step1a = &step1[12];
          __m128i *step1b = &step1[19];
          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
      } else {
        int16_t *in = &intermediate[column_start];
        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
        // Note: using the same approach as above to get a common offset is
        //       counter-productive here, as all the offsets can be computed
        //       at compile time.
        // Note: the next four blocks could be in a loop. That would help the
        //       instruction cache but is actually slower.
        {
          __m128i in00  = _mm_loadu_si128((const __m128i *)(in +  0 * 32));
          __m128i in01  = _mm_loadu_si128((const __m128i *)(in +  1 * 32));
          __m128i in02  = _mm_loadu_si128((const __m128i *)(in +  2 * 32));
          __m128i in03  = _mm_loadu_si128((const __m128i *)(in +  3 * 32));
          __m128i in28  = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
          __m128i in29  = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
          __m128i in30  = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
          __m128i in31  = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
          step1[ 0] = _mm_add_epi16(in00, in31);
          step1[ 1] = _mm_add_epi16(in01, in30);
          step1[ 2] = _mm_add_epi16(in02, in29);
          step1[ 3] = _mm_add_epi16(in03, in28);
          step1[28] = _mm_sub_epi16(in03, in28);
          step1[29] = _mm_sub_epi16(in02, in29);
          step1[30] = _mm_sub_epi16(in01, in30);
          step1[31] = _mm_sub_epi16(in00, in31);
        }
        {
          __m128i in04  = _mm_loadu_si128((const __m128i *)(in +  4 * 32));
          __m128i in05  = _mm_loadu_si128((const __m128i *)(in +  5 * 32));
          __m128i in06  = _mm_loadu_si128((const __m128i *)(in +  6 * 32));
          __m128i in07  = _mm_loadu_si128((const __m128i *)(in +  7 * 32));
          __m128i in24  = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
          __m128i in25  = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
          __m128i in26  = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
          __m128i in27  = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
          step1[ 4] = _mm_add_epi16(in04, in27);
          step1[ 5] = _mm_add_epi16(in05, in26);
          step1[ 6] = _mm_add_epi16(in06, in25);
          step1[ 7] = _mm_add_epi16(in07, in24);
          step1[24] = _mm_sub_epi16(in07, in24);
          step1[25] = _mm_sub_epi16(in06, in25);
          step1[26] = _mm_sub_epi16(in05, in26);
          step1[27] = _mm_sub_epi16(in04, in27);
        }
        {
          __m128i in08  = _mm_loadu_si128((const __m128i *)(in +  8 * 32));
          __m128i in09  = _mm_loadu_si128((const __m128i *)(in +  9 * 32));
          __m128i in10  = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
          __m128i in11  = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
          __m128i in20  = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
          __m128i in21  = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
          __m128i in22  = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
          __m128i in23  = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
          step1[ 8] = _mm_add_epi16(in08, in23);
          step1[ 9] = _mm_add_epi16(in09, in22);
          step1[10] = _mm_add_epi16(in10, in21);
          step1[11] = _mm_add_epi16(in11, in20);
          step1[20] = _mm_sub_epi16(in11, in20);
          step1[21] = _mm_sub_epi16(in10, in21);
          step1[22] = _mm_sub_epi16(in09, in22);
          step1[23] = _mm_sub_epi16(in08, in23);
        }
        {
          __m128i in12  = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
          __m128i in13  = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
          __m128i in14  = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
          __m128i in15  = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
          __m128i in16  = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
          __m128i in17  = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
          __m128i in18  = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
          __m128i in19  = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
          step1[12] = _mm_add_epi16(in12, in19);
          step1[13] = _mm_add_epi16(in13, in18);
          step1[14] = _mm_add_epi16(in14, in17);
          step1[15] = _mm_add_epi16(in15, in16);
          step1[16] = _mm_sub_epi16(in15, in16);
          step1[17] = _mm_sub_epi16(in14, in17);
          step1[18] = _mm_sub_epi16(in13, in18);
          step1[19] = _mm_sub_epi16(in12, in19);
        }
      }
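      // Stage 1 is the first butterfly of the 32-point DCT: at this point
      // step1[i] holds in[i] + in[31 - i] for i < 16 and in[31 - i] - in[i]
      // for i >= 16, pre-scaled by << 2 in the first pass only.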
      // Stage 2
      {
        step2[ 0] = _mm_add_epi16(step1[0], step1[15]);
        step2[ 1] = _mm_add_epi16(step1[1], step1[14]);
        step2[ 2] = _mm_add_epi16(step1[2], step1[13]);
        step2[ 3] = _mm_add_epi16(step1[3], step1[12]);
        step2[ 4] = _mm_add_epi16(step1[4], step1[11]);
        step2[ 5] = _mm_add_epi16(step1[5], step1[10]);
        step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]);
        step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]);
        step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]);
        step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]);
        step2[10] = _mm_sub_epi16(step1[5], step1[10]);
        step2[11] = _mm_sub_epi16(step1[4], step1[11]);
        step2[12] = _mm_sub_epi16(step1[3], step1[12]);
        step2[13] = _mm_sub_epi16(step1[2], step1[13]);
        step2[14] = _mm_sub_epi16(step1[1], step1[14]);
        step2[15] = _mm_sub_epi16(step1[0], step1[15]);
      }
      {
        const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
        const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
        const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
        const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
        const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
        const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
        const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
        const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
        const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
        const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
        const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
        const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
        const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
        const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
        const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
        const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
        const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
        const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
        const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
        const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
        const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
        const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
        const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
        const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
        // dct_const_round_shift
        const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
        const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
        const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
        const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
        const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
        const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
        const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
        const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
        const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
        const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
        const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
        const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
        const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
        const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
        const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
        const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
        // Combine
        step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
        step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
        step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
        step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
        step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
        step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
        step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
        step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
      }
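      // step2[20..27] now hold the rotated middle pairs:
      //   step2[20 + k] = (step1[27 - k] - step1[20 + k]) * cospi_16_64
      //   step2[27 - k] = (step1[27 - k] + step1[20 + k]) * cospi_16_64
      // (rounded and shifted down by DCT_CONST_BITS), i.e. a scaling by
      // sqrt(1/2) of the difference and sum. Indices 16..19 and 28..31 are
      // left in step1 and read directly by stage 3.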
      // Stage 3
      {
        step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
        step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
        step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
        step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
        step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
        step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
        step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
        step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
      }
      {
        const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
        const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
        const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
        const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
        const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
        const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
        const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
        const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
        const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
        const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
        const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
        const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
        // dct_const_round_shift
        const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
        const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
        const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
        const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
        const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
        const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
        const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
        const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
        // Combine
        step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
        step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
        step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
        step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
      }
      {
        step3[16] = _mm_add_epi16(step2[23], step1[16]);
        step3[17] = _mm_add_epi16(step2[22], step1[17]);
        step3[18] = _mm_add_epi16(step2[21], step1[18]);
        step3[19] = _mm_add_epi16(step2[20], step1[19]);
        step3[20] = _mm_sub_epi16(step1[19], step2[20]);
        step3[21] = _mm_sub_epi16(step1[18], step2[21]);
        step3[22] = _mm_sub_epi16(step1[17], step2[22]);
        step3[23] = _mm_sub_epi16(step1[16], step2[23]);
        step3[24] = _mm_sub_epi16(step1[31], step2[24]);
        step3[25] = _mm_sub_epi16(step1[30], step2[25]);
        step3[26] = _mm_sub_epi16(step1[29], step2[26]);
        step3[27] = _mm_sub_epi16(step1[28], step2[27]);
        step3[28] = _mm_add_epi16(step2[27], step1[28]);
        step3[29] = _mm_add_epi16(step2[26], step1[29]);
        step3[30] = _mm_add_epi16(step2[25], step1[30]);
        step3[31] = _mm_add_epi16(step2[24], step1[31]);
      }
      // Scale the intermediate values down with a rounded >> 2 so that they
      // stay within the range of 16 bits.
      if (1 == pass) {
        __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero);
        __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero);
        __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero);
        __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero);
        __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero);
        __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero);
        __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero);
        __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero);
        __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
        __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
        __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero);
        __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero);
        __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero);
        __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero);
        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
        __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero);
        __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero);
        __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero);
        __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero);
        __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero);
        __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero);
        __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero);
        __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero);
        __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero);
        __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero);
        __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero);
        __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero);
        __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero);
        __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero);
        __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero);
        __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero);
        step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0);
        step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0);
        step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0);
        step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0);
        step3[ 4] = _mm_sub_epi16(step3[ 4], s3_04_0);
        step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0);
        step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0);
        step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0);
        step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
        step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
        step3[10] = _mm_sub_epi16(step3[10], s3_10_0);
        step3[11] = _mm_sub_epi16(step3[11], s3_11_0);
        step3[12] = _mm_sub_epi16(step3[12], s3_12_0);
        step3[13] = _mm_sub_epi16(step3[13], s3_13_0);
        step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
        step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
        step3[16] = _mm_sub_epi16(step3[16], s3_16_0);
        step3[17] = _mm_sub_epi16(step3[17], s3_17_0);
        step3[18] = _mm_sub_epi16(step3[18], s3_18_0);
        step3[19] = _mm_sub_epi16(step3[19], s3_19_0);
        step3[20] = _mm_sub_epi16(step3[20], s3_20_0);
        step3[21] = _mm_sub_epi16(step3[21], s3_21_0);
        step3[22] = _mm_sub_epi16(step3[22], s3_22_0);
        step3[23] = _mm_sub_epi16(step3[23], s3_23_0);
        step3[24] = _mm_sub_epi16(step3[24], s3_24_0);
        step3[25] = _mm_sub_epi16(step3[25], s3_25_0);
        step3[26] = _mm_sub_epi16(step3[26], s3_26_0);
        step3[27] = _mm_sub_epi16(step3[27], s3_27_0);
        step3[28] = _mm_sub_epi16(step3[28], s3_28_0);
        step3[29] = _mm_sub_epi16(step3[29], s3_29_0);
        step3[30] = _mm_sub_epi16(step3[30], s3_30_0);
        step3[31] = _mm_sub_epi16(step3[31], s3_31_0);
        step3[ 0] = _mm_add_epi16(step3[ 0], kOne);
        step3[ 1] = _mm_add_epi16(step3[ 1], kOne);
        step3[ 2] = _mm_add_epi16(step3[ 2], kOne);
        step3[ 3] = _mm_add_epi16(step3[ 3], kOne);
        step3[ 4] = _mm_add_epi16(step3[ 4], kOne);
        step3[ 5] = _mm_add_epi16(step3[ 5], kOne);
        step3[ 6] = _mm_add_epi16(step3[ 6], kOne);
        step3[ 7] = _mm_add_epi16(step3[ 7], kOne);
        step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
        step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
        step3[10] = _mm_add_epi16(step3[10], kOne);
        step3[11] = _mm_add_epi16(step3[11], kOne);
        step3[12] = _mm_add_epi16(step3[12], kOne);
        step3[13] = _mm_add_epi16(step3[13], kOne);
        step2[14] = _mm_add_epi16(step2[14], kOne);
        step2[15] = _mm_add_epi16(step2[15], kOne);
        step3[16] = _mm_add_epi16(step3[16], kOne);
        step3[17] = _mm_add_epi16(step3[17], kOne);
        step3[18] = _mm_add_epi16(step3[18], kOne);
        step3[19] = _mm_add_epi16(step3[19], kOne);
        step3[20] = _mm_add_epi16(step3[20], kOne);
        step3[21] = _mm_add_epi16(step3[21], kOne);
        step3[22] = _mm_add_epi16(step3[22], kOne);
        step3[23] = _mm_add_epi16(step3[23], kOne);
        step3[24] = _mm_add_epi16(step3[24], kOne);
        step3[25] = _mm_add_epi16(step3[25], kOne);
        step3[26] = _mm_add_epi16(step3[26], kOne);
        step3[27] = _mm_add_epi16(step3[27], kOne);
        step3[28] = _mm_add_epi16(step3[28], kOne);
        step3[29] = _mm_add_epi16(step3[29], kOne);
        step3[30] = _mm_add_epi16(step3[30], kOne);
        step3[31] = _mm_add_epi16(step3[31], kOne);
        step3[ 0] = _mm_srai_epi16(step3[ 0], 2);
        step3[ 1] = _mm_srai_epi16(step3[ 1], 2);
        step3[ 2] = _mm_srai_epi16(step3[ 2], 2);
        step3[ 3] = _mm_srai_epi16(step3[ 3], 2);
        step3[ 4] = _mm_srai_epi16(step3[ 4], 2);
        step3[ 5] = _mm_srai_epi16(step3[ 5], 2);
        step3[ 6] = _mm_srai_epi16(step3[ 6], 2);
        step3[ 7] = _mm_srai_epi16(step3[ 7], 2);
        step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
        step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
        step3[10] = _mm_srai_epi16(step3[10], 2);
        step3[11] = _mm_srai_epi16(step3[11], 2);
        step3[12] = _mm_srai_epi16(step3[12], 2);
        step3[13] = _mm_srai_epi16(step3[13], 2);
        step2[14] = _mm_srai_epi16(step2[14], 2);
        step2[15] = _mm_srai_epi16(step2[15], 2);
        step3[16] = _mm_srai_epi16(step3[16], 2);
        step3[17] = _mm_srai_epi16(step3[17], 2);
        step3[18] = _mm_srai_epi16(step3[18], 2);
        step3[19] = _mm_srai_epi16(step3[19], 2);
        step3[20] = _mm_srai_epi16(step3[20], 2);
        step3[21] = _mm_srai_epi16(step3[21], 2);
        step3[22] = _mm_srai_epi16(step3[22], 2);
        step3[23] = _mm_srai_epi16(step3[23], 2);
        step3[24] = _mm_srai_epi16(step3[24], 2);
        step3[25] = _mm_srai_epi16(step3[25], 2);
        step3[26] = _mm_srai_epi16(step3[26], 2);
        step3[27] = _mm_srai_epi16(step3[27], 2);
        step3[28] = _mm_srai_epi16(step3[28], 2);
        step3[29] = _mm_srai_epi16(step3[29], 2);
        step3[30] = _mm_srai_epi16(step3[30], 2);
        step3[31] = _mm_srai_epi16(step3[31], 2);
      }
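      // The block above is the vector form of a signed rounded shift, per
      // 16-bit lane:
      //   x = (x + 1 + (x < 0)) >> 2;
      // _mm_cmplt_epi16 yields an all-ones mask (-1) in negative lanes, so
      // subtracting the mask adds the extra 1 that keeps the rounding
      // symmetric for negative values, mirroring the scalar rounding of the
      // C reference.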
      // Stage 4
      {
        step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
        step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]);
        step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]);
        step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]);
        step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]);
        step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]);
        step1[10] = _mm_sub_epi16(step2[ 9], step3[10]);
        step1[11] = _mm_sub_epi16(step2[ 8], step3[11]);
        step1[12] = _mm_sub_epi16(step2[15], step3[12]);
        step1[13] = _mm_sub_epi16(step2[14], step3[13]);
        step1[14] = _mm_add_epi16(step3[13], step2[14]);
        step1[15] = _mm_add_epi16(step3[12], step2[15]);
      }
      {
        const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
        const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
        const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
        const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
        const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
        const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
        // dct_const_round_shift
        const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
        const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
        const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
        const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
        const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
        const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
        const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
        const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
        // Combine
        step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
        step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
      }
      {
        const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
        const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
        const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
        const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
        const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
        const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
        const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
        const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
        const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
        const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
        const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
        const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
        const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
        const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
        const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
        const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
        const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
        const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
        const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
        const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
        const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
        const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
        const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
        const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
        // dct_const_round_shift
        const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
        const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
        const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
        const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
        const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
        const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
        const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
        const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
        const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
        const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
        const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
        const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
        const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
        const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
        const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
        const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
        const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
        const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
        const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
        const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
        const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
        const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
        const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
        const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
        const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
        const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
        const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
        const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
        const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
        const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
        const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
        const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
        // Combine
        step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
        step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
        step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
        step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
        step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
        step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
        step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
        step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
      }
      // Stage 5
      {
        step2[4] = _mm_add_epi16(step1[5], step3[4]);
        step2[5] = _mm_sub_epi16(step3[4], step1[5]);
        step2[6] = _mm_sub_epi16(step3[7], step1[6]);
        step2[7] = _mm_add_epi16(step1[6], step3[7]);
      }
      {
        const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
        const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
        const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
        const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
        const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
        const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
        const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
        const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
        const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
        const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
        const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
        const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
        // dct_const_round_shift
        const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
        const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
        const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
        const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
        const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
        const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
        const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
        const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
        const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
        const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
        const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
        const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
        const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
        const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
        const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
        const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
        // Combine
        out[ 0] = _mm_packs_epi32(out_00_6, out_00_7);
        out[16] = _mm_packs_epi32(out_16_6, out_16_7);
        out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
        out[24] = _mm_packs_epi32(out_24_6, out_24_7);
      }
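      // These four rows are final coefficients, e.g.
      //   out[ 0] = dct_const_round_shift((step1[0] + step1[1]) * cospi_16_64)
      //   out[16] = dct_const_round_shift((step1[0] - step1[1]) * cospi_16_64)
      // while out[8]/out[24] rotate (step1[2], step1[3]) by the
      // (cospi_24_64, cospi_8_64) pair.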
      {
        const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
        const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]);
        const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
        const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
        const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
        const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
        const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
        const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
        const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
        const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
        const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
        const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
        // dct_const_round_shift
        const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
        const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
        const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
        const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
        const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
        const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
        const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
        const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
        // Combine
        step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7);
        step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
        step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
        step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
      }
      {
        step2[16] = _mm_add_epi16(step1[19], step3[16]);
        step2[17] = _mm_add_epi16(step1[18], step3[17]);
        step2[18] = _mm_sub_epi16(step3[17], step1[18]);
        step2[19] = _mm_sub_epi16(step3[16], step1[19]);
        step2[20] = _mm_sub_epi16(step3[23], step1[20]);
        step2[21] = _mm_sub_epi16(step3[22], step1[21]);
        step2[22] = _mm_add_epi16(step1[21], step3[22]);
        step2[23] = _mm_add_epi16(step1[20], step3[23]);
        step2[24] = _mm_add_epi16(step1[27], step3[24]);
        step2[25] = _mm_add_epi16(step1[26], step3[25]);
        step2[26] = _mm_sub_epi16(step3[25], step1[26]);
        step2[27] = _mm_sub_epi16(step3[24], step1[27]);
        step2[28] = _mm_sub_epi16(step3[31], step1[28]);
        step2[29] = _mm_sub_epi16(step3[30], step1[29]);
        step2[30] = _mm_add_epi16(step1[29], step3[30]);
        step2[31] = _mm_add_epi16(step1[28], step3[31]);
      }
      // Stage 6
      {
        const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
        const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
        const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
        const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
        const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
        const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
        const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
        const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
        const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
        const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
        const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
        const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
        const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
        const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
        const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
        const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
        // dct_const_round_shift
        const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
        const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
        const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
        const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
        const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
        const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
        const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
        const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
        const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
        const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
        const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
        const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
        const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
        const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
        const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
        const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
        // Combine
        out[ 4] = _mm_packs_epi32(out_04_6, out_04_7);
        out[20] = _mm_packs_epi32(out_20_6, out_20_7);
        out[12] = _mm_packs_epi32(out_12_6, out_12_7);
        out[28] = _mm_packs_epi32(out_28_6, out_28_7);
      }
3369      {
3370        step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]);
3371        step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]);
3372        step3[10] = _mm_sub_epi16(step1[11], step2[10]);
3373        step3[11] = _mm_add_epi16(step2[10], step1[11]);
3374        step3[12] = _mm_add_epi16(step2[13], step1[12]);
3375        step3[13] = _mm_sub_epi16(step1[12], step2[13]);
3376        step3[14] = _mm_sub_epi16(step1[15], step2[14]);
3377        step3[15] = _mm_add_epi16(step2[14], step1[15]);
3378      }
3379      {
3380        const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
3381        const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
3382        const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
3383        const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
3384        const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
3385        const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
3386        const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
3387        const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
3388        const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
3389        const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
3390        const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
3391        const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
3392        const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
3393        const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
3394        const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
3395        const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
3396        const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
3397        const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
3398        const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
3399        const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
3400        const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
3401        const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
3402        const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
3403        const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
3404        // dct_const_round_shift
3405        const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
3406        const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
3407        const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
3408        const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
3409        const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
3410        const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
3411        const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
3412        const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
3413        const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
3414        const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
3415        const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
3416        const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
3417        const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
3418        const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
3419        const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
3420        const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
3421        const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
3422        const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
3423        const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
3424        const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
3425        const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
3426        const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
3427        const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
3428        const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
3429        const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
3430        const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
3431        const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
3432        const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
3433        const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
3434        const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
3435        const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
3436        const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
3437        // Combine
3438        step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
3439        step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
3440        step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
3441        step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
3442        // Combine
3443        step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
3444        step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
3445        step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
3446        step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
3447      }
3448      // Stage 7
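      // Stage 7 emits the out[k] with k == 2 (mod 4) from step3[8..15] and
      // assembles the step1[16..31] values consumed by the final stage.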
      {
        const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]);
        const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]);
        const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]);
        const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]);
        const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
        const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
        const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
        const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
        const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
        const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
        const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
        const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
        const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
        const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
        const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
        const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
        const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
        const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
        const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
        const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
        const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
        const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
        const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
        const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
        // dct_const_round_shift
        const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
        const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
        const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
        const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
        const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
        const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
        const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
        const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
        const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
        const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
        const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
        const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
        const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
        const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
        const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
        const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
        const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
        const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
        const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
        const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
        const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
        const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
        const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
        const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
        const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
        const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
        const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
        const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
        const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
        const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
        const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
        const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
        // Combine
        out[ 2] = _mm_packs_epi32(out_02_6, out_02_7);
        out[18] = _mm_packs_epi32(out_18_6, out_18_7);
        out[10] = _mm_packs_epi32(out_10_6, out_10_7);
        out[26] = _mm_packs_epi32(out_26_6, out_26_7);
        out[ 6] = _mm_packs_epi32(out_06_6, out_06_7);
        out[22] = _mm_packs_epi32(out_22_6, out_22_7);
        out[14] = _mm_packs_epi32(out_14_6, out_14_7);
        out[30] = _mm_packs_epi32(out_30_6, out_30_7);
      }
      {
        step1[16] = _mm_add_epi16(step3[17], step2[16]);
        step1[17] = _mm_sub_epi16(step2[16], step3[17]);
        step1[18] = _mm_sub_epi16(step2[19], step3[18]);
        step1[19] = _mm_add_epi16(step3[18], step2[19]);
        step1[20] = _mm_add_epi16(step3[21], step2[20]);
        step1[21] = _mm_sub_epi16(step2[20], step3[21]);
        step1[22] = _mm_sub_epi16(step2[23], step3[22]);
        step1[23] = _mm_add_epi16(step3[22], step2[23]);
        step1[24] = _mm_add_epi16(step3[25], step2[24]);
        step1[25] = _mm_sub_epi16(step2[24], step3[25]);
        step1[26] = _mm_sub_epi16(step2[27], step3[26]);
        step1[27] = _mm_add_epi16(step3[26], step2[27]);
        step1[28] = _mm_add_epi16(step3[29], step2[28]);
        step1[29] = _mm_sub_epi16(step2[28], step3[29]);
        step1[30] = _mm_sub_epi16(step2[31], step3[30]);
        step1[31] = _mm_add_epi16(step3[30], step2[31]);
      }
      // Final stage: output indices are bit-reversed.
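      // With 5-bit indices, bit-reversal means e.g. step1[18] (10010b) feeds
      // out[ 9] (01001b) and step1[19] (10011b) feeds out[25] (11001b); the
      // two blocks below produce every odd output position.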
      {
        const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
        const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
        const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
        const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
        const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
        const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
        const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
        const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
        const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
        const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
        const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
        const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
        const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
        const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
        const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
        const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
        const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
        const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
        const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
        const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
        const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
        const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
        const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
        const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
        // dct_const_round_shift
        const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
        const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
        const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
        const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
        const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
        const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
        const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
        const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
        const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
        const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
        const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
        const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
        const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
        const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
        const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
        const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
        const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
        const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
        const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
        const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
        const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
        const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
        const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
        const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
        const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
        const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
        const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
        const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
        const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
        const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
        const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
        const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
        // Combine
        out[ 1] = _mm_packs_epi32(out_01_6, out_01_7);
        out[17] = _mm_packs_epi32(out_17_6, out_17_7);
        out[ 9] = _mm_packs_epi32(out_09_6, out_09_7);
        out[25] = _mm_packs_epi32(out_25_6, out_25_7);
        out[ 7] = _mm_packs_epi32(out_07_6, out_07_7);
        out[23] = _mm_packs_epi32(out_23_6, out_23_7);
        out[15] = _mm_packs_epi32(out_15_6, out_15_7);
        out[31] = _mm_packs_epi32(out_31_6, out_31_7);
      }
      {
        const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
        const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
        const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
        const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
        const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
        const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
        const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
        const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
        const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
        const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
        const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
        const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
        const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
        const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
        const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
        const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
        const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
        const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
        const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
        const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
        const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
        const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
        const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
        const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
        // dct_const_round_shift
        const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
        const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
        const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
        const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
        const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
        const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
        const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
        const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
        const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
        const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
        const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
        const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
        const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
        const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
        const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
        const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
        const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
        const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
        const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
        const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
        const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
        const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
        const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
        const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
        const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
        const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
        const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
        const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
        const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
        const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
        const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
        const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
        // Combine
        out[ 5] = _mm_packs_epi32(out_05_6, out_05_7);
        out[21] = _mm_packs_epi32(out_21_6, out_21_7);
        out[13] = _mm_packs_epi32(out_13_6, out_13_7);
        out[29] = _mm_packs_epi32(out_29_6, out_29_7);
        out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
        out[19] = _mm_packs_epi32(out_19_6, out_19_7);
        out[11] = _mm_packs_epi32(out_11_6, out_11_7);
        out[27] = _mm_packs_epi32(out_27_6, out_27_7);
      }
      // Transpose the results, do it as four 8x8 transposes.
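      // Each 8x8 transpose is the classic three-round SSE2 shuffle:
      // interleave 16-bit lanes, then 32-bit pairs, then 64-bit halves,
      // turning the eight rows into eight columns without scalar spills.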
      {
        int transpose_block;
        int16_t *output;
        if (0 == pass) {
          output = &intermediate[column_start * 32];
        } else {
          output = &output_org[column_start * 32];
        }
        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
          __m128i *this_out = &out[8 * transpose_block];
          // 00 01 02 03 04 05 06 07
          // 10 11 12 13 14 15 16 17
          // 20 21 22 23 24 25 26 27
          // 30 31 32 33 34 35 36 37
          // 40 41 42 43 44 45 46 47
          // 50 51 52 53 54 55 56 57
          // 60 61 62 63 64 65 66 67
          // 70 71 72 73 74 75 76 77
          const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
          const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
          const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
          const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
          const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
          const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
          const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
          const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
          // 00 10 01 11 02 12 03 13
          // 20 30 21 31 22 32 23 33
          // 04 14 05 15 06 16 07 17
          // 24 34 25 35 26 36 27 37
          // 40 50 41 51 42 52 43 53
          // 60 70 61 71 62 72 63 73
          // 44 54 45 55 46 56 47 57
          // 64 74 65 75 66 76 67 77
          const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
          const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
          const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
          const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
          const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
          const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
          const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
          const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
          // 00 10 20 30 01 11 21 31
          // 40 50 60 70 41 51 61 71
          // 02 12 22 32 03 13 23 33
          // 42 52 62 72 43 53 63 73
          // 04 14 24 34 05 15 25 35
          // 44 54 64 74 45 55 65 75
          // 06 16 26 36 07 17 27 37
          // 46 56 66 76 47 57 67 77
          __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
          __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
          __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
          __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
          __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
          __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
          __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
          __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
          // 00 10 20 30 40 50 60 70
          // 01 11 21 31 41 51 61 71
          // 02 12 22 32 42 52 62 72
          // 03 13 23 33 43 53 63 73
          // 04 14 24 34 44 54 64 74
          // 05 15 25 35 45 55 65 75
          // 06 16 26 36 46 56 66 76
          // 07 17 27 37 47 57 67 77
          if (0 == pass) {
            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
            // TODO(cd): see quality impact of only doing
            //           output[j] = (output[j] + 1) >> 2;
            //           which would remove the code between here ...
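            // _mm_cmpgt_epi16 returns 0xFFFF (== -1) in lanes where the
            // value is positive, so subtracting the mask adds 1 exactly in
            // the (output[j] > 0) case of the scalar expression above.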
            __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
            __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
            __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
            __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
            __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
            __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
            __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
            __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
            tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
            tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
            tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
            tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
            tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
            tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
            tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
            tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
            //           ... and here.
            //           PS: also change code in vp9/encoder/vp9_dct.c
            tr2_0 = _mm_add_epi16(tr2_0, kOne);
            tr2_1 = _mm_add_epi16(tr2_1, kOne);
            tr2_2 = _mm_add_epi16(tr2_2, kOne);
            tr2_3 = _mm_add_epi16(tr2_3, kOne);
            tr2_4 = _mm_add_epi16(tr2_4, kOne);
            tr2_5 = _mm_add_epi16(tr2_5, kOne);
            tr2_6 = _mm_add_epi16(tr2_6, kOne);
            tr2_7 = _mm_add_epi16(tr2_7, kOne);
            tr2_0 = _mm_srai_epi16(tr2_0, 2);
            tr2_1 = _mm_srai_epi16(tr2_1, 2);
            tr2_2 = _mm_srai_epi16(tr2_2, 2);
            tr2_3 = _mm_srai_epi16(tr2_3, 2);
            tr2_4 = _mm_srai_epi16(tr2_4, 2);
            tr2_5 = _mm_srai_epi16(tr2_5, 2);
            tr2_6 = _mm_srai_epi16(tr2_6, 2);
            tr2_7 = _mm_srai_epi16(tr2_7, 2);
          }
          // Note: even though all these stores are aligned, using the aligned
          //       intrinsic makes the code slightly slower.
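          // (The row stride is 32 int16_t == 64 bytes and output advances by
          // 8 int16_t == 16 bytes per 8x8 block, so 16-byte alignment is
          // preserved as long as the destination base itself is aligned.)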
          _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0);
          _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1);
          _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2);
          _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3);
          _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4);
          _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5);
          _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6);
          _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7);
          // Process next 8x8
          output += 8;
        }
      }
    }
  }
}