// vp9_idct_intrin_sse2.c revision ba164dffc5a6795bce97fae02b51ccf3330e15e4
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

// To improve performance, clip the absolute diff value to [0, 255],
// which allows the additions/subtractions to stay in 8 bits.
void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
                               uint8_t *dst_ptr, int pitch, int stride) {
  int a1;
  int16_t out;
  uint8_t abs_diff;
  __m128i p0, p1, p2, p3;
  unsigned int extended_diff;
  __m128i diff;

  out = dct_const_round_shift(input_dc * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 4);
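  // a1 is the constant residual value applied to every pixel of the 4x4 block:
  // the two dct_const_round_shift() calls correspond to the row and column
  // passes of the DC-only IDCT, and ROUND_POWER_OF_TWO(out, 4) is the final shift.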

  // Read prediction data.
  p0 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 0 * pitch));
  p1 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 1 * pitch));
  p2 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 2 * pitch));
  p3 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 3 * pitch));

  // Unpack prediction data, and store 4x4 array in 1 XMM register.
  p0 = _mm_unpacklo_epi32(p0, p1);
  p2 = _mm_unpacklo_epi32(p2, p3);
  p0 = _mm_unpacklo_epi64(p0, p2);

  // Clip the DC value to the [0, 255] range, then add or subtract it
  // according to its sign.
  if (a1 >= 0) {
    abs_diff = (a1 > 255) ? 255 : a1;
    extended_diff = abs_diff * 0x01010101u;
    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);

    p1 = _mm_adds_epu8(p0, diff);
  } else {
    abs_diff = (a1 < -255) ? 255 : -a1;
    extended_diff = abs_diff * 0x01010101u;
    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);

    p1 = _mm_subs_epu8(p0, diff);
  }

  // Store results to dst.
  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
  dst_ptr += stride;

  p1 = _mm_srli_si128(p1, 4);
  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
  dst_ptr += stride;

  p1 = _mm_srli_si128(p1, 4);
  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
  dst_ptr += stride;

  p1 = _mm_srli_si128(p1, 4);
  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
}

void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
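  // cst packs the stage-1 cosine pairs so that a single pmaddwd per input row
  // produces all four stage-1 values: (i0 + i2) * cospi_16, (i0 - i2) * cospi_16,
  // i1 * cospi_24 - i3 * cospi_8, and i1 * cospi_8 + i3 * cospi_24.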
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_loadl_epi64((__m128i *)input);
  input1 = _mm_loadl_epi64((__m128i *)(input + 4));
  input2 = _mm_loadl_epi64((__m128i *)(input + 8));
  input3 = _mm_loadl_epi64((__m128i *)(input + 12));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
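  // (0xd8 reorders the low words to i0, i2, i1, i3; the 32-bit unpack then
  // duplicates each pair so the layout matches cst.)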
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input1 = _mm_shufflelo_epi16(input1, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input3 = _mm_shufflelo_epi16(input3, 0xd8);

  input0 = _mm_unpacklo_epi32(input0, input0);
  input1 = _mm_unpacklo_epi32(input1, input1);
  input2 = _mm_unpacklo_epi32(input2, input2);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, zero);
  input1 = _mm_packs_epi32(input1, zero);
  input2 = _mm_packs_epi32(input2, zero);
  input3 = _mm_packs_epi32(input3, zero);

  // Transpose
  input1 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpacklo_epi16(input2, input3);
  input0 = _mm_unpacklo_epi32(input1, input3);
  input1 = _mm_unpackhi_epi32(input1, input3);

  // Switch columns 2 and 3, after which:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input2, 0xd8);
  input1 = _mm_shufflehi_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input3, 0xd8);
  input3 = _mm_shufflelo_epi16(input3, 0xd8);

  input0 = _mm_unpacklo_epi32(input0, input0);
  input1 = _mm_unpackhi_epi32(input1, input1);
  input2 = _mm_unpackhi_epi32(input2, input2);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, zero);
  input1 = _mm_packs_epi32(input1, zero);
  input2 = _mm_packs_epi32(input2, zero);
  input3 = _mm_packs_epi32(input3, zero);

  // Transpose
  input1 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpacklo_epi16(input2, input3);
  input0 = _mm_unpacklo_epi32(input1, input3);
  input1 = _mm_unpackhi_epi32(input1, input3);

  // Switch columns 2 and 3, after which:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

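// RECON_AND_STORE4X4: add one 4-pixel row of the 16-bit residual in in_x to the
// bytes at dest, saturate the result to [0, 255], write the 4 bytes back, and
// advance dest by stride.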
#define RECON_AND_STORE4X4(dest, in_x) \
  {                                                     \
      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
      d0 = _mm_unpacklo_epi8(d0, zero); \
      d0 = _mm_add_epi16(in_x, d0); \
      d0 = _mm_packus_epi16(d0, d0); \
      *(int *)dest = _mm_cvtsi128_si32(d0); \
      dest += stride; \
  }

  input0 = _mm_srli_si128(input2, 8);
  input1 = _mm_srli_si128(input3, 8);

  RECON_AND_STORE4X4(dest, input2);
  RECON_AND_STORE4X4(dest, input0);
  RECON_AND_STORE4X4(dest, input1);
  RECON_AND_STORE4X4(dest, input3);
}

void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
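  // c2's +/-1 weights let a single pmaddwd form the stage-2 butterfly sums and
  // differences once the shuffles below pair up the stage-1 outputs.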

  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i in, temp;

  // Load input data.
  in = _mm_loadl_epi64((__m128i *)input);

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  in = _mm_shufflelo_epi16(in, 0xd8);
  in = _mm_unpacklo_epi32(in, in);

  // Stage 1
  in = _mm_madd_epi16(in, c1);
  in = _mm_add_epi32(in, rounding);
  in = _mm_srai_epi32(in, DCT_CONST_BITS);
  in = _mm_packs_epi32(in, zero);

  // Stage 2
  temp = _mm_shufflelo_epi16(in, 0x9c);
  in = _mm_shufflelo_epi16(in, 0xc9);
  in = _mm_unpacklo_epi64(temp, in);
  in = _mm_madd_epi16(in, c2);
  in = _mm_packs_epi32(in, zero);

  // Store results
  _mm_storel_epi64((__m128i *)output, in);
}

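// Transpose an 8x8 block of 16-bit values held in in0..in7 into out0..out7 using
// three rounds of unpacking at 16-, 32- and 64-bit granularity.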
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
                                                        \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
                                                            \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

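// Transpose variant for the case where only the low four 16-bit values of each
// input row are meaningful; the last four output rows are simply set to zero.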
#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
                                                        \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
                                                            \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = out5 = out6 = out7 = zero; \
  }

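// Partial transpose of four 8-wide rows; the interleaved column pairs are written
// back into in0..in3 (the out arguments are not used by the macro body).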
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
                                                        \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
    in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */  \
    in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */  \
  }

// Macro for multiplying elements by constants and adding the products together.
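// Each (lo, hi) pair holds interleaved 16-bit inputs; pmaddwd with the cst
// constants, the rounding add, and the DCT_CONST_BITS shift implement the
// fixed-point butterfly res = (a * c0 + b * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS.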
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  {   \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      tmp4 = _mm_madd_epi16(lo_1, cst2); \
      tmp5 = _mm_madd_epi16(hi_1, cst2); \
      tmp6 = _mm_madd_epi16(lo_1, cst3); \
      tmp7 = _mm_madd_epi16(hi_1, cst3); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      tmp4 = _mm_add_epi32(tmp4, rounding); \
      tmp5 = _mm_add_epi32(tmp5, rounding); \
      tmp6 = _mm_add_epi32(tmp6, rounding); \
      tmp7 = _mm_add_epi32(tmp7, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
      \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
      res2 = _mm_packs_epi32(tmp4, tmp5); \
      res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

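// One-dimensional 8-point IDCT applied to the eight columns held in in0..in7.
// The stg1_*/stg2_* constants, rounding, and the stp*/tmp* temporaries must be
// in scope; the results overwrite in0..in7.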
#define IDCT8x8_1D  \
  /* Stage1 */      \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                          stg1_1, stg1_2, stg1_3, stp1_4,      \
                          stp1_7, stp1_5, stp1_6)              \
  } \
    \
  /* Stage2 */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0,     \
                           stp2_1, stp2_2, stp2_3)             \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
    \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4  */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

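// Add an 8-pixel row of the 16-bit residual in in_x to dest, saturate to 8 bits,
// store the row back, and advance dest by stride.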
#define RECON_AND_STORE(dest, in_x) \
  {                                                     \
      __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
      d0 = _mm_unpacklo_epi8(d0, zero); \
      in_x = _mm_add_epi16(in_x, d0); \
      in_x = _mm_packus_epi16(in_x, in_x); \
      _mm_storel_epi64((__m128i *)(dest), in_x); \
      dest += stride; \
  }

void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((__m128i *)input);
  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((__m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((__m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((__m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((__m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8x8_1D
  }

  // Final rounding and shift
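  // (adding 1 << 4 and then shifting right by 5 implements ROUND_POWER_OF_TWO(x, 5))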
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

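// Variant used when only the first few (at most 10) coefficients are non-zero:
// only the top four input rows are loaded; the remaining rows are treated as zero,
// so the row pass reduces to a single 8x4 transpose.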
void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((__m128i *)input);
  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)

  // Stage1
  {
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);
  }

  // Stage2
  {
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);

    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, zero);
    stp1_6 = _mm_packs_epi32(tmp2, zero);
  }

  // Stage4
  in0 = _mm_adds_epi16(stp1_0, stp2_7);
  in1 = _mm_adds_epi16(stp1_1, stp1_6);
  in2 = _mm_adds_epi16(stp1_2, stp1_5);
  in3 = _mm_adds_epi16(stp1_3, stp2_4);
  in4 = _mm_subs_epi16(stp1_3, stp2_4);
  in5 = _mm_subs_epi16(stp1_2, stp1_5);
  in6 = _mm_subs_epi16(stp1_1, stp1_6);
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

  // Columns. 4x8 Transpose
  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7)

  // 1D idct8x8
  IDCT8x8_1D

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

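// One-dimensional 16-point IDCT (stages 2 through 6) over the sixteen columns in
// in0..in15; the final stage-7 butterflies are left to the caller. The
// stg2_*..stg6_0 constants, rounding, and the stp*/tmp* temporaries must be in scope.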
#define IDCT16x16_1D \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7);   \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7);   \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
    \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
    \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
    \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
          in14 = zero, in15 = zero;
  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
          r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
          r12 = zero, r13 = zero, r14 = zero, r15 = zero;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // We work on an 8x16 block at a time and loop four times for the 2-D 16x16 idct.
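  // Passes 0 and 1 are the row transforms of the left and right input halves
  // (results kept in l0..l15 and r0..r15); passes 2 and 3 are the column
  // transforms, whose results are reconstructed into dest.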
  for (i = 0; i < 4; i++) {
    // 1-D idct
    if (i < 2) {
      if (i == 1) input += 128;

      // Load input data.
      in0 = _mm_load_si128((__m128i *)input);
      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
      in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
      in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
      in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
      in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
      in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
      in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
      in4 = _mm_load_si128((__m128i *)(input + 8 * 8));
      in12 = _mm_load_si128((__m128i *)(input + 8 * 9));
      in5 = _mm_load_si128((__m128i *)(input + 8 * 10));
      in13 = _mm_load_si128((__m128i *)(input + 8 * 11));
      in6 = _mm_load_si128((__m128i *)(input + 8 * 12));
      in14 = _mm_load_si128((__m128i *)(input + 8 * 13));
      in7 = _mm_load_si128((__m128i *)(input + 8 * 14));
      in15 = _mm_load_si128((__m128i *)(input + 8 * 15));

      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
    }

    if (i == 2) {
      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
                    in5, in6, in7);
      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
                    in13, in14, in15);
    }

    if (i == 3) {
      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
                    in12, in13, in14, in15);
    }

    IDCT16x16_1D

    // Stage7
    if (i == 0) {
      // Left 8x16
      l0 = _mm_add_epi16(stp2_0, stp1_15);
      l1 = _mm_add_epi16(stp2_1, stp1_14);
      l2 = _mm_add_epi16(stp2_2, stp2_13);
      l3 = _mm_add_epi16(stp2_3, stp2_12);
      l4 = _mm_add_epi16(stp2_4, stp2_11);
      l5 = _mm_add_epi16(stp2_5, stp2_10);
      l6 = _mm_add_epi16(stp2_6, stp1_9);
      l7 = _mm_add_epi16(stp2_7, stp1_8);
      l8 = _mm_sub_epi16(stp2_7, stp1_8);
      l9 = _mm_sub_epi16(stp2_6, stp1_9);
      l10 = _mm_sub_epi16(stp2_5, stp2_10);
      l11 = _mm_sub_epi16(stp2_4, stp2_11);
      l12 = _mm_sub_epi16(stp2_3, stp2_12);
      l13 = _mm_sub_epi16(stp2_2, stp2_13);
      l14 = _mm_sub_epi16(stp2_1, stp1_14);
      l15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else if (i == 1) {
      // Right 8x16
      r0 = _mm_add_epi16(stp2_0, stp1_15);
      r1 = _mm_add_epi16(stp2_1, stp1_14);
      r2 = _mm_add_epi16(stp2_2, stp2_13);
      r3 = _mm_add_epi16(stp2_3, stp2_12);
      r4 = _mm_add_epi16(stp2_4, stp2_11);
      r5 = _mm_add_epi16(stp2_5, stp2_10);
      r6 = _mm_add_epi16(stp2_6, stp1_9);
      r7 = _mm_add_epi16(stp2_7, stp1_8);
      r8 = _mm_sub_epi16(stp2_7, stp1_8);
      r9 = _mm_sub_epi16(stp2_6, stp1_9);
      r10 = _mm_sub_epi16(stp2_5, stp2_10);
      r11 = _mm_sub_epi16(stp2_4, stp2_11);
      r12 = _mm_sub_epi16(stp2_3, stp2_12);
      r13 = _mm_sub_epi16(stp2_2, stp2_13);
      r14 = _mm_sub_epi16(stp2_1, stp1_14);
      r15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else {
      // 2-D
      in0 = _mm_add_epi16(stp2_0, stp1_15);
      in1 = _mm_add_epi16(stp2_1, stp1_14);
      in2 = _mm_add_epi16(stp2_2, stp2_13);
      in3 = _mm_add_epi16(stp2_3, stp2_12);
      in4 = _mm_add_epi16(stp2_4, stp2_11);
      in5 = _mm_add_epi16(stp2_5, stp2_10);
      in6 = _mm_add_epi16(stp2_6, stp1_9);
      in7 = _mm_add_epi16(stp2_7, stp1_8);
      in8 = _mm_sub_epi16(stp2_7, stp1_8);
      in9 = _mm_sub_epi16(stp2_6, stp1_9);
      in10 = _mm_sub_epi16(stp2_5, stp2_10);
      in11 = _mm_sub_epi16(stp2_4, stp2_11);
      in12 = _mm_sub_epi16(stp2_3, stp2_12);
      in13 = _mm_sub_epi16(stp2_2, stp2_13);
      in14 = _mm_sub_epi16(stp2_1, stp1_14);
      in15 = _mm_sub_epi16(stp2_0, stp1_15);

      // Final rounding and shift
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);

      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);

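      // Advance dest from the bottom of this 8-pixel-wide strip to the top of
      // the next one.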
      dest += 8 - (stride * 16);
    }
  }
}

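// Variant used when only the first few (at most 10) coefficients are non-zero:
// only the top four rows of input are loaded and the remaining rows are treated
// as zero, so a single row pass (producing the left 8x16 results) is enough.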
void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
                                     int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
          in14 = zero, in15 = zero;
  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
          l12 = zero, l13 = zero, l14 = zero, l15 = zero;

  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // 1-D idct. Load input data.
  in0 = _mm_load_si128((__m128i *)input);
  in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
  in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
  in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
  in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
  in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
  in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
  in11 = _mm_load_si128((__m128i *)(input + 8 * 7));

  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);

  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
    const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
    const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);

    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
    tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
    tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_8 = _mm_packs_epi32(tmp0, zero);
    stp2_15 = _mm_packs_epi32(tmp2, zero);
    stp2_9 = _mm_packs_epi32(tmp4, zero);
    stp2_14 = _mm_packs_epi32(tmp6, zero);

    stp2_10 = _mm_packs_epi32(tmp1, zero);
    stp2_13 = _mm_packs_epi32(tmp3, zero);
    stp2_11 = _mm_packs_epi32(tmp5, zero);
    stp2_12 = _mm_packs_epi32(tmp7, zero);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);

    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);

    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);

    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
  }

  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);

    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);
    stp2_9 = _mm_packs_epi32(tmp1, zero);
    stp2_14 = _mm_packs_epi32(tmp3, zero);
    stp2_10 = _mm_packs_epi32(tmp5, zero);
    stp2_13 = _mm_packs_epi32(tmp7, zero);

    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
  }

  // Stage5 and Stage6
  {
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);

    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);

    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
  }

  // Stage6
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp1, zero);
    stp1_6 = _mm_packs_epi32(tmp3, zero);
    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp2, zero);
    stp2_11 = _mm_packs_epi32(tmp4, zero);
    stp2_12 = _mm_packs_epi32(tmp6, zero);

    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
  }

  // Stage7. Left 8x16 only.
  l0 = _mm_add_epi16(stp2_0, stp1_15);
  l1 = _mm_add_epi16(stp2_1, stp1_14);
  l2 = _mm_add_epi16(stp2_2, stp2_13);
  l3 = _mm_add_epi16(stp2_3, stp2_12);
  l4 = _mm_add_epi16(stp2_4, stp2_11);
  l5 = _mm_add_epi16(stp2_5, stp2_10);
  l6 = _mm_add_epi16(stp2_6, stp1_9);
  l7 = _mm_add_epi16(stp2_7, stp1_8);
  l8 = _mm_sub_epi16(stp2_7, stp1_8);
  l9 = _mm_sub_epi16(stp2_6, stp1_9);
  l10 = _mm_sub_epi16(stp2_5, stp2_10);
  l11 = _mm_sub_epi16(stp2_4, stp2_11);
  l12 = _mm_sub_epi16(stp2_3, stp2_12);
  l13 = _mm_sub_epi16(stp2_2, stp2_13);
  l14 = _mm_sub_epi16(stp2_1, stp1_14);
  l15 = _mm_sub_epi16(stp2_0, stp1_15);

  // 2-D idct. We do 2 8x16 blocks.
  for (i = 0; i < 2; i++) {
    if (i == 0)
      TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
                    in5, in6, in7);

    if (i == 1)
      TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
                    in4, in5, in6, in7);

    in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;

    IDCT16x16_1D

    // Stage7
    in0 = _mm_add_epi16(stp2_0, stp1_15);
    in1 = _mm_add_epi16(stp2_1, stp1_14);
    in2 = _mm_add_epi16(stp2_2, stp2_13);
    in3 = _mm_add_epi16(stp2_3, stp2_12);
    in4 = _mm_add_epi16(stp2_4, stp2_11);
    in5 = _mm_add_epi16(stp2_5, stp2_10);
    in6 = _mm_add_epi16(stp2_6, stp1_9);
    in7 = _mm_add_epi16(stp2_7, stp1_8);
    in8 = _mm_sub_epi16(stp2_7, stp1_8);
    in9 = _mm_sub_epi16(stp2_6, stp1_9);
    in10 = _mm_sub_epi16(stp2_5, stp2_10);
    in11 = _mm_sub_epi16(stp2_4, stp2_11);
    in12 = _mm_sub_epi16(stp2_3, stp2_12);
    in13 = _mm_sub_epi16(stp2_2, stp2_13);
    in14 = _mm_sub_epi16(stp2_1, stp1_14);
    in15 = _mm_sub_epi16(stp2_0, stp1_15);

    // Final rounding and shift
    in0 = _mm_adds_epi16(in0, final_rounding);
    in1 = _mm_adds_epi16(in1, final_rounding);
    in2 = _mm_adds_epi16(in2, final_rounding);
    in3 = _mm_adds_epi16(in3, final_rounding);
    in4 = _mm_adds_epi16(in4, final_rounding);
    in5 = _mm_adds_epi16(in5, final_rounding);
    in6 = _mm_adds_epi16(in6, final_rounding);
    in7 = _mm_adds_epi16(in7, final_rounding);
    in8 = _mm_adds_epi16(in8, final_rounding);
    in9 = _mm_adds_epi16(in9, final_rounding);
    in10 = _mm_adds_epi16(in10, final_rounding);
    in11 = _mm_adds_epi16(in11, final_rounding);
    in12 = _mm_adds_epi16(in12, final_rounding);
    in13 = _mm_adds_epi16(in13, final_rounding);
    in14 = _mm_adds_epi16(in14, final_rounding);
    in15 = _mm_adds_epi16(in15, final_rounding);

    in0 = _mm_srai_epi16(in0, 6);
    in1 = _mm_srai_epi16(in1, 6);
    in2 = _mm_srai_epi16(in2, 6);
    in3 = _mm_srai_epi16(in3, 6);
    in4 = _mm_srai_epi16(in4, 6);
    in5 = _mm_srai_epi16(in5, 6);
    in6 = _mm_srai_epi16(in6, 6);
    in7 = _mm_srai_epi16(in7, 6);
    in8 = _mm_srai_epi16(in8, 6);
    in9 = _mm_srai_epi16(in9, 6);
    in10 = _mm_srai_epi16(in10, 6);
    in11 = _mm_srai_epi16(in11, 6);
    in12 = _mm_srai_epi16(in12, 6);
    in13 = _mm_srai_epi16(in13, 6);
    in14 = _mm_srai_epi16(in14, 6);
    in15 = _mm_srai_epi16(in15, 6);

    RECON_AND_STORE(dest, in0);
    RECON_AND_STORE(dest, in1);
    RECON_AND_STORE(dest, in2);
    RECON_AND_STORE(dest, in3);
    RECON_AND_STORE(dest, in4);
    RECON_AND_STORE(dest, in5);
    RECON_AND_STORE(dest, in6);
    RECON_AND_STORE(dest, in7);
    RECON_AND_STORE(dest, in8);
    RECON_AND_STORE(dest, in9);
    RECON_AND_STORE(dest, in10);
    RECON_AND_STORE(dest, in11);
    RECON_AND_STORE(dest, in12);
    RECON_AND_STORE(dest, in13);
    RECON_AND_STORE(dest, in14);
    RECON_AND_STORE(dest, in15);

    dest += 8 - (stride * 16);
  }
}

void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
          in24, in25, in26, in27, in28, in29, in30, in31;
1389  __m128i col[128];
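      // col[] buffers the first-pass output: 4 iterations x 32 rows of 8
      // results each.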
1390  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1391          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1392          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
1393          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
1394          stp1_30, stp1_31;
1395  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1396          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
1397          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
1398          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
1399          stp2_30, stp2_31;
1400  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1401  int i, j;
1402
1403  // We work on an 8x32 block at a time and loop 8 times for the 2-D 32x32 idct.
1404  for (i = 0; i < 8; i++) {
1405    if (i < 4) {
1406      // First 1-D idct
1407      // Load input data.
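          // Each first-pass iteration consumes 8 rows x 32 columns of
          // coefficients.  The registers are grouped so that in0..in7 hold
          // columns 0-7 of those 8 rows, in8..in15 columns 8-15, and so on,
          // ready for the 8x8 transposes below.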
1408      in0 = _mm_load_si128((__m128i *)input);
1409      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
1410      in16 = _mm_load_si128((__m128i *)(input + 8 * 2));
1411      in24 = _mm_load_si128((__m128i *)(input + 8 * 3));
1412      in1 = _mm_load_si128((__m128i *)(input + 8 * 4));
1413      in9 = _mm_load_si128((__m128i *)(input + 8 * 5));
1414      in17 = _mm_load_si128((__m128i *)(input + 8 * 6));
1415      in25 = _mm_load_si128((__m128i *)(input + 8 * 7));
1416      in2 = _mm_load_si128((__m128i *)(input + 8 * 8));
1417      in10 = _mm_load_si128((__m128i *)(input + 8 * 9));
1418      in18 = _mm_load_si128((__m128i *)(input + 8 * 10));
1419      in26 = _mm_load_si128((__m128i *)(input + 8 * 11));
1420      in3 = _mm_load_si128((__m128i *)(input + 8 * 12));
1421      in11 = _mm_load_si128((__m128i *)(input + 8 * 13));
1422      in19 = _mm_load_si128((__m128i *)(input + 8 * 14));
1423      in27 = _mm_load_si128((__m128i *)(input + 8 * 15));
1424
1425      in4 = _mm_load_si128((__m128i *)(input + 8 * 16));
1426      in12 = _mm_load_si128((__m128i *)(input + 8 * 17));
1427      in20 = _mm_load_si128((__m128i *)(input + 8 * 18));
1428      in28 = _mm_load_si128((__m128i *)(input + 8 * 19));
1429      in5 = _mm_load_si128((__m128i *)(input + 8 * 20));
1430      in13 = _mm_load_si128((__m128i *)(input + 8 * 21));
1431      in21 = _mm_load_si128((__m128i *)(input + 8 * 22));
1432      in29 = _mm_load_si128((__m128i *)(input + 8 * 23));
1433      in6 = _mm_load_si128((__m128i *)(input + 8 * 24));
1434      in14 = _mm_load_si128((__m128i *)(input + 8 * 25));
1435      in22 = _mm_load_si128((__m128i *)(input + 8 * 26));
1436      in30 = _mm_load_si128((__m128i *)(input + 8 * 27));
1437      in7 = _mm_load_si128((__m128i *)(input + 8 * 28));
1438      in15 = _mm_load_si128((__m128i *)(input + 8 * 29));
1439      in23 = _mm_load_si128((__m128i *)(input + 8 * 30));
1440      in31 = _mm_load_si128((__m128i *)(input + 8 * 31));
1441
1442      input += 256;
1443
1444      // Transpose 32x8 block to 8x32 block
1445      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
1446                    in4, in5, in6, in7);
1447      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
1448                    in10, in11, in12, in13, in14, in15);
1449      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
1450                    in18, in19, in20, in21, in22, in23);
1451      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
1452                    in26, in27, in28, in29, in30, in31);
1453    } else {
1454      // Second 1-D idct
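          // Second pass: gather the first-pass results for 8 of the 32
          // columns from col[] and transpose them so the 1-D idct below runs
          // down the columns of the original block.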
1455      j = i - 4;
1456
1457      // Transpose 32x8 block to 8x32 block
1458      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
1459                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
1460                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
1461                    in5, in6, in7);
1462      j += 4;
1463      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
1464                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
1465                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
1466                    in11, in12, in13, in14, in15);
1467      j += 4;
1468      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
1469                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
1470                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
1471                    in19, in20, in21, in22, in23);
1472      j += 4;
1473      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
1474                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
1475                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
1476                    in28, in29, in30, in31);
1477    }
1478
1479    // Stage1
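        // Stage1 rotates the odd-frequency input pairs (in1/in31, in17/in15,
        // ...) into stp1_16..stp1_31; each MULTIPLICATION_AND_ADD invocation
        // is assumed to perform two such rotations, each followed by
        // dct_const_round_shift.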
1480    {
1481      const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);
1482      const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);
1483      const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);
1484      const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);
1485
1486      const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);
1487      const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);
1488      const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7);
1489      const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);
1490
1491      const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);
1492      const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);
1493      const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);
1494      const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);
1495
1496      const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);
1497      const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);
1498      const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);
1499      const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);
1500
1501      MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,
1502                             stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,
1503                             stp1_17, stp1_30)
1504      MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,
1505                             stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,
1506                             stp1_19, stp1_28)
1507      MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,
1508                             stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,
1509                             stp1_21, stp1_26)
1510      MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
1511                             stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,
1512                             stp1_23, stp1_24)
1513    }
1514
1515    // Stage2
1516    {
1517      const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);
1518      const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);
1519      const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);
1520      const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);
1521
1522      const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);
1523      const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);
1524      const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);
1525      const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);
1526
1527      MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,
1528                             stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,
1529                             stp2_14)
1530      MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,
1531                             stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,
1532                             stp2_11, stp2_12)
1533
1534      stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
1535      stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
1536      stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
1537      stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
1538
1539      stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
1540      stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
1541      stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
1542      stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
1543
1544      stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
1545      stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
1546      stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
1547      stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
1548
1549      stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
1550      stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
1551      stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
1552      stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
1553    }
1554
1555    // Stage3
1556    {
1557      const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);
1558      const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);
1559      const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);
1560      const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);
1561
1562      const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
1563      const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
1564      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
1565      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
1566
1567      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
1568      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
1569      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
1570      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
1571
1572      MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,
1573                             stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,
1574                             stp1_6)
1575
1576      stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
1577      stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
1578      stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
1579      stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
1580      stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
1581      stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
1582      stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
1583      stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
1584
1585      MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
1586                             stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,
1587                             stp1_18, stp1_29)
1588      MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
1589                             stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,
1590                             stp1_22, stp1_25)
1591
1592      stp1_16 = stp2_16;
1593      stp1_31 = stp2_31;
1594      stp1_19 = stp2_19;
1595      stp1_20 = stp2_20;
1596      stp1_23 = stp2_23;
1597      stp1_24 = stp2_24;
1598      stp1_27 = stp2_27;
1599      stp1_28 = stp2_28;
1600    }
1601
1602    // Stage4
1603    {
1604      const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16);
1605      const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16);
1606      const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24);
1607      const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24);
1608
1609      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
1610      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
1611      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
1612      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
1613
1614      MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,
1615                             stg4_1, stg4_2, stg4_3, stp2_0, stp2_1,
1616                             stp2_2, stp2_3)
1617
1618      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
1619      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
1620      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
1621      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
1622
1623      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,
1624                             stg4_5, stg4_6, stg4_4, stp2_9, stp2_14,
1625                             stp2_10, stp2_13)
1626
1627      stp2_8 = stp1_8;
1628      stp2_15 = stp1_15;
1629      stp2_11 = stp1_11;
1630      stp2_12 = stp1_12;
1631
1632      stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
1633      stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
1634      stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
1635      stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
1636      stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
1637      stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
1638      stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
1639      stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
1640
1641      stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
1642      stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
1643      stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
1644      stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
1645      stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
1646      stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
1647      stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
1648      stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
1649    }
1650
1651    // Stage5
1652    {
1653      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
1654      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
1655      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
1656      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
1657
1658      const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
1659      const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
1660      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
1661      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
1662
1663      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
1664      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
1665
1666      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
1667      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
1668      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
1669      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
1670
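          // Single rotation written out long-hand (the pattern the
          // MULTIPLICATION_AND_ADD macro presumably wraps):
          //   stp1_5 = dct_const_round_shift((stp2_6 - stp2_5) * cospi_16_64)
          //   stp1_6 = dct_const_round_shift((stp2_6 + stp2_5) * cospi_16_64)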
1671      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
1672      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
1673      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
1674      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
1675
1676      tmp0 = _mm_add_epi32(tmp0, rounding);
1677      tmp1 = _mm_add_epi32(tmp1, rounding);
1678      tmp2 = _mm_add_epi32(tmp2, rounding);
1679      tmp3 = _mm_add_epi32(tmp3, rounding);
1680
1681      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1682      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
1683      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1684      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
1685
1686      stp1_5 = _mm_packs_epi32(tmp0, tmp1);
1687      stp1_6 = _mm_packs_epi32(tmp2, tmp3);
1688
1689      stp1_4 = stp2_4;
1690      stp1_7 = stp2_7;
1691
1692      stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
1693      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
1694      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
1695      stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
1696      stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
1697      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
1698      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
1699      stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
1700
1701      stp1_16 = stp2_16;
1702      stp1_17 = stp2_17;
1703
1704      MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
1705                             stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,
1706                             stp1_19, stp1_28)
1707      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
1708                             stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,
1709                             stp1_21, stp1_26)
1710
1711      stp1_22 = stp2_22;
1712      stp1_23 = stp2_23;
1713      stp1_24 = stp2_24;
1714      stp1_25 = stp2_25;
1715      stp1_30 = stp2_30;
1716      stp1_31 = stp2_31;
1717    }
1718
1719    // Stage6
1720    {
1721      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
1722      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
1723      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
1724      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
1725
1726      stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
1727      stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
1728      stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
1729      stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
1730      stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
1731      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
1732      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
1733      stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
1734
1735      stp2_8 = stp1_8;
1736      stp2_9 = stp1_9;
1737      stp2_14 = stp1_14;
1738      stp2_15 = stp1_15;
1739
1740      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12,
1741                             stg6_0, stg4_0, stg6_0, stg4_0, stp2_10,
1742                             stp2_13, stp2_11, stp2_12)
1743
1744      stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
1745      stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
1746      stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
1747      stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
1748      stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
1749      stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
1750      stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
1751      stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
1752
1753      stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
1754      stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
1755      stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
1756      stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
1757      stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
1758      stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
1759      stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
1760      stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
1761    }
1762
1763    // Stage7
1764    {
1765      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
1766      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
1767      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
1768      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
1769
1770      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
1771      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
1772      const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
1773      const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
1774
1775      stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
1776      stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
1777      stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
1778      stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
1779      stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
1780      stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
1781      stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
1782      stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
1783      stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
1784      stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
1785      stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
1786      stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
1787      stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
1788      stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
1789      stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
1790      stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
1791
1792      stp1_16 = stp2_16;
1793      stp1_17 = stp2_17;
1794      stp1_18 = stp2_18;
1795      stp1_19 = stp2_19;
1796
1797      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
1798                             stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,
1799                             stp1_21, stp1_26)
1800      MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
1801                             stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,
1802                             stp1_23, stp1_24)
1803
1804      stp1_28 = stp2_28;
1805      stp1_29 = stp2_29;
1806      stp1_30 = stp2_30;
1807      stp1_31 = stp2_31;
1808    }
1809
1810    // Final stage
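        // Output butterfly of the 32-point idct:
        //   out[k]      = step[k] + step[31 - k]
        //   out[31 - k] = step[k] - step[31 - k],   k = 0..15
        // The first pass stores these rows to col[]; the second pass rounds,
        // shifts and reconstructs them into dest.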
1811    if (i < 4) {
1812      // First 1-D pass: store the 32 intermediate results for this 8x32 block.
1813      col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
1814      col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
1815      col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
1816      col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
1817      col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
1818      col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
1819      col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
1820      col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
1821      col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
1822      col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
1823      col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
1824      col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
1825      col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
1826      col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
1827      col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
1828      col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
1829      col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
1830      col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
1831      col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
1832      col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
1833      col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
1834      col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
1835      col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
1836      col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
1837      col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
1838      col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
1839      col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
1840      col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
1841      col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
1842      col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
1843      col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
1844      col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
1845    } else {
1846      const __m128i zero = _mm_setzero_si128();
1847
1848      // Second 1-D pass: compute the results and store them to the destination.
1849      in0 = _mm_add_epi16(stp1_0, stp1_31);
1850      in1 = _mm_add_epi16(stp1_1, stp1_30);
1851      in2 = _mm_add_epi16(stp1_2, stp1_29);
1852      in3 = _mm_add_epi16(stp1_3, stp1_28);
1853      in4 = _mm_add_epi16(stp1_4, stp1_27);
1854      in5 = _mm_add_epi16(stp1_5, stp1_26);
1855      in6 = _mm_add_epi16(stp1_6, stp1_25);
1856      in7 = _mm_add_epi16(stp1_7, stp1_24);
1857      in8 = _mm_add_epi16(stp1_8, stp1_23);
1858      in9 = _mm_add_epi16(stp1_9, stp1_22);
1859      in10 = _mm_add_epi16(stp1_10, stp1_21);
1860      in11 = _mm_add_epi16(stp1_11, stp1_20);
1861      in12 = _mm_add_epi16(stp1_12, stp1_19);
1862      in13 = _mm_add_epi16(stp1_13, stp1_18);
1863      in14 = _mm_add_epi16(stp1_14, stp1_17);
1864      in15 = _mm_add_epi16(stp1_15, stp1_16);
1865      in16 = _mm_sub_epi16(stp1_15, stp1_16);
1866      in17 = _mm_sub_epi16(stp1_14, stp1_17);
1867      in18 = _mm_sub_epi16(stp1_13, stp1_18);
1868      in19 = _mm_sub_epi16(stp1_12, stp1_19);
1869      in20 = _mm_sub_epi16(stp1_11, stp1_20);
1870      in21 = _mm_sub_epi16(stp1_10, stp1_21);
1871      in22 = _mm_sub_epi16(stp1_9, stp1_22);
1872      in23 = _mm_sub_epi16(stp1_8, stp1_23);
1873      in24 = _mm_sub_epi16(stp1_7, stp1_24);
1874      in25 = _mm_sub_epi16(stp1_6, stp1_25);
1875      in26 = _mm_sub_epi16(stp1_5, stp1_26);
1876      in27 = _mm_sub_epi16(stp1_4, stp1_27);
1877      in28 = _mm_sub_epi16(stp1_3, stp1_28);
1878      in29 = _mm_sub_epi16(stp1_2, stp1_29);
1879      in30 = _mm_sub_epi16(stp1_1, stp1_30);
1880      in31 = _mm_sub_epi16(stp1_0, stp1_31);
1881
1882      // Final rounding and shift
1883      in0 = _mm_adds_epi16(in0, final_rounding);
1884      in1 = _mm_adds_epi16(in1, final_rounding);
1885      in2 = _mm_adds_epi16(in2, final_rounding);
1886      in3 = _mm_adds_epi16(in3, final_rounding);
1887      in4 = _mm_adds_epi16(in4, final_rounding);
1888      in5 = _mm_adds_epi16(in5, final_rounding);
1889      in6 = _mm_adds_epi16(in6, final_rounding);
1890      in7 = _mm_adds_epi16(in7, final_rounding);
1891      in8 = _mm_adds_epi16(in8, final_rounding);
1892      in9 = _mm_adds_epi16(in9, final_rounding);
1893      in10 = _mm_adds_epi16(in10, final_rounding);
1894      in11 = _mm_adds_epi16(in11, final_rounding);
1895      in12 = _mm_adds_epi16(in12, final_rounding);
1896      in13 = _mm_adds_epi16(in13, final_rounding);
1897      in14 = _mm_adds_epi16(in14, final_rounding);
1898      in15 = _mm_adds_epi16(in15, final_rounding);
1899      in16 = _mm_adds_epi16(in16, final_rounding);
1900      in17 = _mm_adds_epi16(in17, final_rounding);
1901      in18 = _mm_adds_epi16(in18, final_rounding);
1902      in19 = _mm_adds_epi16(in19, final_rounding);
1903      in20 = _mm_adds_epi16(in20, final_rounding);
1904      in21 = _mm_adds_epi16(in21, final_rounding);
1905      in22 = _mm_adds_epi16(in22, final_rounding);
1906      in23 = _mm_adds_epi16(in23, final_rounding);
1907      in24 = _mm_adds_epi16(in24, final_rounding);
1908      in25 = _mm_adds_epi16(in25, final_rounding);
1909      in26 = _mm_adds_epi16(in26, final_rounding);
1910      in27 = _mm_adds_epi16(in27, final_rounding);
1911      in28 = _mm_adds_epi16(in28, final_rounding);
1912      in29 = _mm_adds_epi16(in29, final_rounding);
1913      in30 = _mm_adds_epi16(in30, final_rounding);
1914      in31 = _mm_adds_epi16(in31, final_rounding);
1915
1916      in0 = _mm_srai_epi16(in0, 6);
1917      in1 = _mm_srai_epi16(in1, 6);
1918      in2 = _mm_srai_epi16(in2, 6);
1919      in3 = _mm_srai_epi16(in3, 6);
1920      in4 = _mm_srai_epi16(in4, 6);
1921      in5 = _mm_srai_epi16(in5, 6);
1922      in6 = _mm_srai_epi16(in6, 6);
1923      in7 = _mm_srai_epi16(in7, 6);
1924      in8 = _mm_srai_epi16(in8, 6);
1925      in9 = _mm_srai_epi16(in9, 6);
1926      in10 = _mm_srai_epi16(in10, 6);
1927      in11 = _mm_srai_epi16(in11, 6);
1928      in12 = _mm_srai_epi16(in12, 6);
1929      in13 = _mm_srai_epi16(in13, 6);
1930      in14 = _mm_srai_epi16(in14, 6);
1931      in15 = _mm_srai_epi16(in15, 6);
1932      in16 = _mm_srai_epi16(in16, 6);
1933      in17 = _mm_srai_epi16(in17, 6);
1934      in18 = _mm_srai_epi16(in18, 6);
1935      in19 = _mm_srai_epi16(in19, 6);
1936      in20 = _mm_srai_epi16(in20, 6);
1937      in21 = _mm_srai_epi16(in21, 6);
1938      in22 = _mm_srai_epi16(in22, 6);
1939      in23 = _mm_srai_epi16(in23, 6);
1940      in24 = _mm_srai_epi16(in24, 6);
1941      in25 = _mm_srai_epi16(in25, 6);
1942      in26 = _mm_srai_epi16(in26, 6);
1943      in27 = _mm_srai_epi16(in27, 6);
1944      in28 = _mm_srai_epi16(in28, 6);
1945      in29 = _mm_srai_epi16(in29, 6);
1946      in30 = _mm_srai_epi16(in30, 6);
1947      in31 = _mm_srai_epi16(in31, 6);
1948
1949      RECON_AND_STORE(dest, in0);
1950      RECON_AND_STORE(dest, in1);
1951      RECON_AND_STORE(dest, in2);
1952      RECON_AND_STORE(dest, in3);
1953      RECON_AND_STORE(dest, in4);
1954      RECON_AND_STORE(dest, in5);
1955      RECON_AND_STORE(dest, in6);
1956      RECON_AND_STORE(dest, in7);
1957      RECON_AND_STORE(dest, in8);
1958      RECON_AND_STORE(dest, in9);
1959      RECON_AND_STORE(dest, in10);
1960      RECON_AND_STORE(dest, in11);
1961      RECON_AND_STORE(dest, in12);
1962      RECON_AND_STORE(dest, in13);
1963      RECON_AND_STORE(dest, in14);
1964      RECON_AND_STORE(dest, in15);
1965      RECON_AND_STORE(dest, in16);
1966      RECON_AND_STORE(dest, in17);
1967      RECON_AND_STORE(dest, in18);
1968      RECON_AND_STORE(dest, in19);
1969      RECON_AND_STORE(dest, in20);
1970      RECON_AND_STORE(dest, in21);
1971      RECON_AND_STORE(dest, in22);
1972      RECON_AND_STORE(dest, in23);
1973      RECON_AND_STORE(dest, in24);
1974      RECON_AND_STORE(dest, in25);
1975      RECON_AND_STORE(dest, in26);
1976      RECON_AND_STORE(dest, in27);
1977      RECON_AND_STORE(dest, in28);
1978      RECON_AND_STORE(dest, in29);
1979      RECON_AND_STORE(dest, in30);
1980      RECON_AND_STORE(dest, in31);
1981
1982      dest += 8 - (stride * 32);
1983    }
1984  }
1985}
1986