// inv_txfm_sse2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

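// Adds four 16-bit residual values (in_x) to four pixels loaded from dest,
// saturates the sums back to 8 bits, and writes them to dest. A `zero`
// register must be in scope at the expansion site.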
#define RECON_AND_STORE4X4(dest, in_x) \
{                                                     \
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  *(int *)(dest) = _mm_cvtsi128_si32(d0); \
}

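// Full 4x4 inverse DCT plus reconstruction. Each 128-bit register carries two
// rows of the block, so both 1-D passes (rows, then columns) work on the whole
// 4x4 block with four registers; the result is rounded, shifted by 4 and added
// to the prediction in dest.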
void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16(
      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch columns 2 and 3, after which we get:
  // input2: columns 1 and 0;  input3: columns 2 and 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch columns 2 and 3, after which we get:
  // input2: columns 1 and 0;  input3: columns 2 and 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store input0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store input1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store input2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store input3
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

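// DC-only 4x4 inverse transform: the single DC coefficient is scaled by
// cospi_16_64 once per 1-D pass, rounded and shifted by 4, and the resulting
// constant is added to every pixel of the 4x4 block.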
void vpx_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
}

static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}

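// One 4-point inverse DCT pass over the 4x4 block held in in[0..1]:
// transpose, butterfly with the cospi constant pairs, and repack in place.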
void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

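// One 4-point inverse ADST pass over the 4x4 block held in in[0..1], using
// the sinpi_*_9 constants; operates in place after a 4x4 transpose.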
void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}

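// Full 8x8 transpose of eight rows held in __m128i registers, built from
// three levels of 16-, 32- and 64-bit interleaves.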
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
                                                        \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
                                                            \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
                         out0, out1, out2, out3) \
  {                                              \
    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  }

#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  {                                            \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  }

// Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  {   \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      tmp4 = _mm_madd_epi16(lo_1, cst2); \
      tmp5 = _mm_madd_epi16(hi_1, cst2); \
      tmp6 = _mm_madd_epi16(lo_1, cst3); \
      tmp7 = _mm_madd_epi16(hi_1, cst3); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      tmp4 = _mm_add_epi32(tmp4, rounding); \
      tmp5 = _mm_add_epi32(tmp5, rounding); \
      tmp6 = _mm_add_epi32(tmp6, rounding); \
      tmp7 = _mm_add_epi32(tmp7, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
      \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
      res2 = _mm_packs_epi32(tmp4, tmp5); \
      res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  {   \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
  }

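// Four-stage 8-point inverse DCT butterfly. The stg1_*/stg2_* constant pairs,
// the stp1_*/stp2_* stage temporaries, tmp0-tmp7 and `rounding` must all be
// declared at the expansion site.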
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3, out4, out5, out6, out7)  \
  { \
  /* Stage1 */      \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                          stg1_1, stg1_2, stg1_3, stp1_4,      \
                          stp1_7, stp1_5, stp1_6)              \
  } \
    \
  /* Stage2 */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0,     \
                           stp2_1, stp2_2, stp2_3)             \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
    \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4  */ \
  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }

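// Full 8x8 inverse DCT (all 64 coefficients): two transpose + IDCT8 passes,
// then rounding by 1 << 4, an arithmetic shift of 5, and reconstruction into
// dest via RECON_AND_STORE.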
void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
                  in0, in1, in2, in3, in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
          in0, in1, in2, in3, in4, in5, in6, in7);
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}

void vpx_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest + 0 * stride, dc_value);
  RECON_AND_STORE(dest + 1 * stride, dc_value);
  RECON_AND_STORE(dest + 2 * stride, dc_value);
  RECON_AND_STORE(dest + 3 * stride, dc_value);
  RECON_AND_STORE(dest + 4 * stride, dc_value);
  RECON_AND_STORE(dest + 5 * stride, dc_value);
  RECON_AND_STORE(dest + 6 * stride, dc_value);
  RECON_AND_STORE(dest + 7 * stride, dc_value);
}

void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                in0, in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}

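// 8-point inverse ADST on eight columns. The rows are transposed and
// reordered for the butterfly, and the odd outputs are negated by
// subtracting from zero in the final assignments.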
void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

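// 8x8 inverse DCT for sparse blocks whose non-zero coefficients lie in the
// top-left 4x4 corner (at most 12 of them in scan order): the row pass works
// on a 4x4 subset and the column pass reuses IDCT8 with the remaining inputs
// set to zero.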
void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
  // Stage1
  {
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  {
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);

    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);

  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
        in0, in1, in2, in3, in4, in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}

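// Stages 2-6 of the 16-point inverse DCT butterfly on the sixteen registers
// in[]; the final stage-7 additions/subtractions are done by the callers.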
#define IDCT16 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
    \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
    \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
    \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

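// Reduced version of IDCT16 for inputs where only in[0..3] carry non-zero
// coefficients; all other inputs are treated as zero.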
#define IDCT16_10 \
    /* Stage2 */ \
    { \
      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
      \
      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
                             stg2_0, stg2_1, stg2_6, stg2_7, \
                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
    } \
      \
    /* Stage3 */ \
    { \
      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
      \
      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
                               stg3_0, stg3_1,  \
                               stp2_4, stp2_7) \
      \
      stp1_9  =  stp1_8_0; \
      stp1_10 =  stp1_11;  \
      \
      stp1_13 = stp1_12_0; \
      stp1_14 = stp1_15;   \
    } \
    \
    /* Stage4 */ \
    { \
      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
      \
      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
      \
      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
                               stg4_0, stg4_1, \
                               stp1_0, stp1_1) \
      stp2_5 = stp2_4; \
      stp2_6 = stp2_7; \
      \
      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                             stg4_4, stg4_5, stg4_6, stg4_7, \
                             stp2_9, stp2_14, stp2_10, stp2_13) \
    } \
      \
    /* Stage5 */ \
    { \
      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_2 = stp1_1; \
      stp1_3 = stp1_0; \
      \
      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
      \
      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
      \
      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
    } \
      \
    /* Stage6 */ \
    { \
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
      \
      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
      \
      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                             stg6_0, stg4_0, stg6_0, stg4_0, \
                             stp2_10, stp2_13, stp2_11, stp2_12) \
    }

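// Full 16x16 inverse DCT. The row pass writes intermediate results to l[] and
// r[] (left and right 8-column halves); the column pass then applies IDCT16
// again, rounds by 1 << 5, shifts by 6 and reconstructs into dest.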
void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
                                int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[16], l[16], r[16], *curr1;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  curr1 = l;
  for (i = 0; i < 2; i++) {
    // 1-D idct

    // Load input data.
    in[0] = _mm_load_si128((const __m128i *)input);
    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));

    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);

    IDCT16

    // Stage7
    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

    curr1 = r;
    input += 128;
  }
  for (i = 0; i < 2; i++) {
    int j;
    // 1-D idct
    array_transpose_8x8(l + i * 8, in);
    array_transpose_8x8(r + i * 8, in + 8);

    IDCT16

    // 2-D
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

void vpx_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 2; ++i) {
    RECON_AND_STORE(dest +  0 * stride, dc_value);
    RECON_AND_STORE(dest +  1 * stride, dc_value);
    RECON_AND_STORE(dest +  2 * stride, dc_value);
    RECON_AND_STORE(dest +  3 * stride, dc_value);
    RECON_AND_STORE(dest +  4 * stride, dc_value);
    RECON_AND_STORE(dest +  5 * stride, dc_value);
    RECON_AND_STORE(dest +  6 * stride, dc_value);
    RECON_AND_STORE(dest +  7 * stride, dc_value);
    RECON_AND_STORE(dest +  8 * stride, dc_value);
    RECON_AND_STORE(dest +  9 * stride, dc_value);
    RECON_AND_STORE(dest + 10 * stride, dc_value);
    RECON_AND_STORE(dest + 11 * stride, dc_value);
    RECON_AND_STORE(dest + 12 * stride, dc_value);
    RECON_AND_STORE(dest + 13 * stride, dc_value);
    RECON_AND_STORE(dest + 14 * stride, dc_value);
    RECON_AND_STORE(dest + 15 * stride, dc_value);
    dest += 8;
  }
}

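// 16-point inverse ADST applied to eight columns at a time, operating in
// place on in[0..15].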
1329static void iadst16_8col(__m128i *in) {
1330  // perform 16x16 1-D ADST for 8 columns
1331  __m128i s[16], x[16], u[32], v[32];
1332  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1333  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1334  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1335  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1336  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1337  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1338  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1339  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1340  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1341  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1342  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1343  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1344  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1345  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1346  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1347  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1348  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1349  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1350  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1351  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1352  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1353  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1354  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1355  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1356  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1357  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
1358  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1359  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1360  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1361  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1362  const __m128i kZero = _mm_set1_epi16(0);
1363
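  // Stage 1 butterflies.  Each input pair is interleaved with
  // _mm_unpacklo/hi_epi16 and multiplied against a (cos, cos) constant pair
  // with _mm_madd_epi16; the 32-bit sums are rounded (DCT_CONST_ROUNDING)
  // and shifted (DCT_CONST_BITS), i.e. the vector form of
  // dct_const_round_shift(x * c0 + y * c1), eight columns per register.
  // Scalar sketch of the first butterfly (assuming the usual C reference
  // naming, iadst16_c, and ignoring any intermediate clamping):
  //   s0 = in15 * cospi_1_64  + in0 * cospi_31_64;
  //   s1 = in15 * cospi_31_64 - in0 * cospi_1_64;
  //   s8 = in7  * cospi_17_64 + in8 * cospi_15_64;
  //   x0 = dct_const_round_shift(s0 + s8);
  //   x8 = dct_const_round_shift(s0 - s8);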
1364  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1365  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1366  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1367  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1368  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1369  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1370  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1371  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1372  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1373  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1374  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1375  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1376  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1377  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1378  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1379  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1380
1381  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1382  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1383  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1384  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1385  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1386  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1387  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1388  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1389  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1390  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1391  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1392  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1393  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1394  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1395  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1396  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1397  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1398  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1399  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1400  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1401  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1402  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1403  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1404  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1405  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1406  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1407  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1408  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1409  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1410  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1411  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1412  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1413
1414  u[0] = _mm_add_epi32(v[0], v[16]);
1415  u[1] = _mm_add_epi32(v[1], v[17]);
1416  u[2] = _mm_add_epi32(v[2], v[18]);
1417  u[3] = _mm_add_epi32(v[3], v[19]);
1418  u[4] = _mm_add_epi32(v[4], v[20]);
1419  u[5] = _mm_add_epi32(v[5], v[21]);
1420  u[6] = _mm_add_epi32(v[6], v[22]);
1421  u[7] = _mm_add_epi32(v[7], v[23]);
1422  u[8] = _mm_add_epi32(v[8], v[24]);
1423  u[9] = _mm_add_epi32(v[9], v[25]);
1424  u[10] = _mm_add_epi32(v[10], v[26]);
1425  u[11] = _mm_add_epi32(v[11], v[27]);
1426  u[12] = _mm_add_epi32(v[12], v[28]);
1427  u[13] = _mm_add_epi32(v[13], v[29]);
1428  u[14] = _mm_add_epi32(v[14], v[30]);
1429  u[15] = _mm_add_epi32(v[15], v[31]);
1430  u[16] = _mm_sub_epi32(v[0], v[16]);
1431  u[17] = _mm_sub_epi32(v[1], v[17]);
1432  u[18] = _mm_sub_epi32(v[2], v[18]);
1433  u[19] = _mm_sub_epi32(v[3], v[19]);
1434  u[20] = _mm_sub_epi32(v[4], v[20]);
1435  u[21] = _mm_sub_epi32(v[5], v[21]);
1436  u[22] = _mm_sub_epi32(v[6], v[22]);
1437  u[23] = _mm_sub_epi32(v[7], v[23]);
1438  u[24] = _mm_sub_epi32(v[8], v[24]);
1439  u[25] = _mm_sub_epi32(v[9], v[25]);
1440  u[26] = _mm_sub_epi32(v[10], v[26]);
1441  u[27] = _mm_sub_epi32(v[11], v[27]);
1442  u[28] = _mm_sub_epi32(v[12], v[28]);
1443  u[29] = _mm_sub_epi32(v[13], v[29]);
1444  u[30] = _mm_sub_epi32(v[14], v[30]);
1445  u[31] = _mm_sub_epi32(v[15], v[31]);
1446
1447  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1448  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1449  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1450  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1451  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1452  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1453  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1454  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1455  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1456  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1457  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1458  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1459  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1460  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1461  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1462  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1463  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1464  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1465  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1466  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1467  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1468  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1469  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1470  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1471  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1472  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1473  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1474  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1475  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1476  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1477  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1478  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1479
1480  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1481  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1482  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1483  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1484  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1485  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1486  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1487  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1488  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1489  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1490  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1491  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1492  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1493  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1494  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1495  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1496  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1497  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1498  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1499  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1500  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1501  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1502  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1503  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1504  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1505  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1506  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1507  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1508  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1509  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1510  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1511  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1512
1513  s[0] = _mm_packs_epi32(u[0], u[1]);
1514  s[1] = _mm_packs_epi32(u[2], u[3]);
1515  s[2] = _mm_packs_epi32(u[4], u[5]);
1516  s[3] = _mm_packs_epi32(u[6], u[7]);
1517  s[4] = _mm_packs_epi32(u[8], u[9]);
1518  s[5] = _mm_packs_epi32(u[10], u[11]);
1519  s[6] = _mm_packs_epi32(u[12], u[13]);
1520  s[7] = _mm_packs_epi32(u[14], u[15]);
1521  s[8] = _mm_packs_epi32(u[16], u[17]);
1522  s[9] = _mm_packs_epi32(u[18], u[19]);
1523  s[10] = _mm_packs_epi32(u[20], u[21]);
1524  s[11] = _mm_packs_epi32(u[22], u[23]);
1525  s[12] = _mm_packs_epi32(u[24], u[25]);
1526  s[13] = _mm_packs_epi32(u[26], u[27]);
1527  s[14] = _mm_packs_epi32(u[28], u[29]);
1528  s[15] = _mm_packs_epi32(u[30], u[31]);
1529
1530  // stage 2
1531  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1532  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1533  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1534  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1535  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1536  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1537  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1538  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1539
1540  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1541  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1542  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1543  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1544  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1545  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1546  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1547  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1548  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1549  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1550  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1551  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1552  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1553  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1554  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1555  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1556
1557  u[0] = _mm_add_epi32(v[0], v[8]);
1558  u[1] = _mm_add_epi32(v[1], v[9]);
1559  u[2] = _mm_add_epi32(v[2], v[10]);
1560  u[3] = _mm_add_epi32(v[3], v[11]);
1561  u[4] = _mm_add_epi32(v[4], v[12]);
1562  u[5] = _mm_add_epi32(v[5], v[13]);
1563  u[6] = _mm_add_epi32(v[6], v[14]);
1564  u[7] = _mm_add_epi32(v[7], v[15]);
1565  u[8] = _mm_sub_epi32(v[0], v[8]);
1566  u[9] = _mm_sub_epi32(v[1], v[9]);
1567  u[10] = _mm_sub_epi32(v[2], v[10]);
1568  u[11] = _mm_sub_epi32(v[3], v[11]);
1569  u[12] = _mm_sub_epi32(v[4], v[12]);
1570  u[13] = _mm_sub_epi32(v[5], v[13]);
1571  u[14] = _mm_sub_epi32(v[6], v[14]);
1572  u[15] = _mm_sub_epi32(v[7], v[15]);
1573
1574  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1575  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1576  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1577  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1578  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1579  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1580  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1581  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1582  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1583  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1584  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1585  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1586  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1587  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1588  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1589  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1590
1591  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1592  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1593  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1594  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1595  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1596  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1597  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1598  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1599  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1600  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1601  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1602  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1603  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1604  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1605  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1606  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1607
1608  x[0] = _mm_add_epi16(s[0], s[4]);
1609  x[1] = _mm_add_epi16(s[1], s[5]);
1610  x[2] = _mm_add_epi16(s[2], s[6]);
1611  x[3] = _mm_add_epi16(s[3], s[7]);
1612  x[4] = _mm_sub_epi16(s[0], s[4]);
1613  x[5] = _mm_sub_epi16(s[1], s[5]);
1614  x[6] = _mm_sub_epi16(s[2], s[6]);
1615  x[7] = _mm_sub_epi16(s[3], s[7]);
1616  x[8] = _mm_packs_epi32(u[0], u[1]);
1617  x[9] = _mm_packs_epi32(u[2], u[3]);
1618  x[10] = _mm_packs_epi32(u[4], u[5]);
1619  x[11] = _mm_packs_epi32(u[6], u[7]);
1620  x[12] = _mm_packs_epi32(u[8], u[9]);
1621  x[13] = _mm_packs_epi32(u[10], u[11]);
1622  x[14] = _mm_packs_epi32(u[12], u[13]);
1623  x[15] = _mm_packs_epi32(u[14], u[15]);
1624
1625  // stage 3
1626  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1627  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1628  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1629  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1630  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1631  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1632  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1633  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1634
1635  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1636  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1637  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1638  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1639  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1640  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1641  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1642  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1643  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1644  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1645  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1646  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1647  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1648  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1649  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1650  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1651
1652  u[0] = _mm_add_epi32(v[0], v[4]);
1653  u[1] = _mm_add_epi32(v[1], v[5]);
1654  u[2] = _mm_add_epi32(v[2], v[6]);
1655  u[3] = _mm_add_epi32(v[3], v[7]);
1656  u[4] = _mm_sub_epi32(v[0], v[4]);
1657  u[5] = _mm_sub_epi32(v[1], v[5]);
1658  u[6] = _mm_sub_epi32(v[2], v[6]);
1659  u[7] = _mm_sub_epi32(v[3], v[7]);
1660  u[8] = _mm_add_epi32(v[8], v[12]);
1661  u[9] = _mm_add_epi32(v[9], v[13]);
1662  u[10] = _mm_add_epi32(v[10], v[14]);
1663  u[11] = _mm_add_epi32(v[11], v[15]);
1664  u[12] = _mm_sub_epi32(v[8], v[12]);
1665  u[13] = _mm_sub_epi32(v[9], v[13]);
1666  u[14] = _mm_sub_epi32(v[10], v[14]);
1667  u[15] = _mm_sub_epi32(v[11], v[15]);
1668
1669  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1670  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1671  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1672  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1673  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1674  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1675  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1676  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1677  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1678  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1679  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1680  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1681  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1682  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1683  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1684  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1685
1686  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1687  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1688  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1689  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1690  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1691  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1692  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1693  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1694  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1695  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1696  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1697  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1698  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1699  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1700  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1701  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1702
1703  s[0] = _mm_add_epi16(x[0], x[2]);
1704  s[1] = _mm_add_epi16(x[1], x[3]);
1705  s[2] = _mm_sub_epi16(x[0], x[2]);
1706  s[3] = _mm_sub_epi16(x[1], x[3]);
1707  s[4] = _mm_packs_epi32(v[0], v[1]);
1708  s[5] = _mm_packs_epi32(v[2], v[3]);
1709  s[6] = _mm_packs_epi32(v[4], v[5]);
1710  s[7] = _mm_packs_epi32(v[6], v[7]);
1711  s[8] = _mm_add_epi16(x[8], x[10]);
1712  s[9] = _mm_add_epi16(x[9], x[11]);
1713  s[10] = _mm_sub_epi16(x[8], x[10]);
1714  s[11] = _mm_sub_epi16(x[9], x[11]);
1715  s[12] = _mm_packs_epi32(v[8], v[9]);
1716  s[13] = _mm_packs_epi32(v[10], v[11]);
1717  s[14] = _mm_packs_epi32(v[12], v[13]);
1718  s[15] = _mm_packs_epi32(v[14], v[15]);
1719
1720  // stage 4
1721  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1722  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1723  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1724  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1725  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1726  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1727  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1728  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1729
1730  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1731  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1732  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1733  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1734  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1735  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1736  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1737  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1738  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1739  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1740  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1741  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1742  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1743  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1744  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1745  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1746
1747  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1748  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1749  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1750  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1751  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1752  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1753  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1754  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1755  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1756  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1757  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1758  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1759  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1760  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1761  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1762  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1763
1764  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1765  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1766  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1767  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1768  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1769  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1770  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1771  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1772  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1773  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1774  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1775  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1776  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1777  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1778  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1779  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1780
1781  in[0] = s[0];
1782  in[1] = _mm_sub_epi16(kZero, s[8]);
1783  in[2] = s[12];
1784  in[3] = _mm_sub_epi16(kZero, s[4]);
1785  in[4] = _mm_packs_epi32(v[4], v[5]);
1786  in[5] = _mm_packs_epi32(v[12], v[13]);
1787  in[6] = _mm_packs_epi32(v[8], v[9]);
1788  in[7] = _mm_packs_epi32(v[0], v[1]);
1789  in[8] = _mm_packs_epi32(v[2], v[3]);
1790  in[9] = _mm_packs_epi32(v[10], v[11]);
1791  in[10] = _mm_packs_epi32(v[14], v[15]);
1792  in[11] = _mm_packs_epi32(v[6], v[7]);
1793  in[12] = s[5];
1794  in[13] = _mm_sub_epi16(kZero, s[13]);
1795  in[14] = s[9];
1796  in[15] = _mm_sub_epi16(kZero, s[1]);
1797}
1798
1799static void idct16_8col(__m128i *in) {
1800  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1801  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
1802  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1803  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
1804  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1805  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
1806  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1807  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
1808  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1809  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1810  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1811  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1812  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1813  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1814  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1815  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1816  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1817  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1818  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1819  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1820  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1821  __m128i v[16], u[16], s[16], t[16];
1822
1823  // stage 1
1824  s[0] = in[0];
1825  s[1] = in[8];
1826  s[2] = in[4];
1827  s[3] = in[12];
1828  s[4] = in[2];
1829  s[5] = in[10];
1830  s[6] = in[6];
1831  s[7] = in[14];
1832  s[8] = in[1];
1833  s[9] = in[9];
1834  s[10] = in[5];
1835  s[11] = in[13];
1836  s[12] = in[3];
1837  s[13] = in[11];
1838  s[14] = in[7];
1839  s[15] = in[15];
1840
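  // Stage 1 is a pure reorder: the even-indexed coefficients
  // (0, 8, 4, 12, 2, 10, 6, 14) feed the embedded 8-point IDCT in s[0..7];
  // the odd-indexed coefficients drive the odd half in s[8..15].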
1841  // stage 2
1842  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
1843  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
1844  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
1845  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
1846  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
1847  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
1848  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
1849  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
1850
1851  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
1852  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
1853  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
1854  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
1855  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
1856  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
1857  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
1858  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
1859  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
1860  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
1861  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
1862  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
1863  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
1864  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
1865  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
1866  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
1867
1868  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1869  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1870  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1871  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1872  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1873  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1874  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1875  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1876  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1877  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1878  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1879  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1880  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1881  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1882  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1883  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1884
1885  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1886  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1887  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1888  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1889  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1890  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1891  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1892  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1893  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1894  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1895  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1896  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1897  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1898  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1899  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1900  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1901
1902  s[8]  = _mm_packs_epi32(u[0], u[1]);
1903  s[15] = _mm_packs_epi32(u[2], u[3]);
1904  s[9]  = _mm_packs_epi32(u[4], u[5]);
1905  s[14] = _mm_packs_epi32(u[6], u[7]);
1906  s[10] = _mm_packs_epi32(u[8], u[9]);
1907  s[13] = _mm_packs_epi32(u[10], u[11]);
1908  s[11] = _mm_packs_epi32(u[12], u[13]);
1909  s[12] = _mm_packs_epi32(u[14], u[15]);
1910
1911  // stage 3
1912  t[0] = s[0];
1913  t[1] = s[1];
1914  t[2] = s[2];
1915  t[3] = s[3];
1916  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
1917  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
1918  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
1919  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
1920
1921  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1922  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1923  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1924  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1925  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1926  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1927  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1928  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1929
1930  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1931  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1932  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1933  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1934  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1935  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1936  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1937  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1938
1939  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1940  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1941  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1942  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1943  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1944  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1945  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1946  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1947
1948  t[4] = _mm_packs_epi32(u[0], u[1]);
1949  t[7] = _mm_packs_epi32(u[2], u[3]);
1950  t[5] = _mm_packs_epi32(u[4], u[5]);
1951  t[6] = _mm_packs_epi32(u[6], u[7]);
1952  t[8] = _mm_add_epi16(s[8], s[9]);
1953  t[9] = _mm_sub_epi16(s[8], s[9]);
1954  t[10] = _mm_sub_epi16(s[11], s[10]);
1955  t[11] = _mm_add_epi16(s[10], s[11]);
1956  t[12] = _mm_add_epi16(s[12], s[13]);
1957  t[13] = _mm_sub_epi16(s[12], s[13]);
1958  t[14] = _mm_sub_epi16(s[15], s[14]);
1959  t[15] = _mm_add_epi16(s[14], s[15]);
1960
1961  // stage 4
1962  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
1963  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
1964  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
1965  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
1966  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
1967  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
1968  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
1969  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
1970
1971  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1972  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1973  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1974  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1975  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
1976  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
1977  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1978  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1979  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
1980  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
1981  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
1982  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
1983  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
1984  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
1985  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
1986  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
1987
1988  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1989  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1990  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1991  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1992  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1993  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1994  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1995  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1996  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1997  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1998  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1999  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2000  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2001  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2002  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2003  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2004
2005  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2006  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2007  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2008  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2009  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2010  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2011  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2012  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2013  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2014  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2015  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2016  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2017  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2018  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2019  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2020  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2021
2022  s[0] = _mm_packs_epi32(u[0], u[1]);
2023  s[1] = _mm_packs_epi32(u[2], u[3]);
2024  s[2] = _mm_packs_epi32(u[4], u[5]);
2025  s[3] = _mm_packs_epi32(u[6], u[7]);
2026  s[4] = _mm_add_epi16(t[4], t[5]);
2027  s[5] = _mm_sub_epi16(t[4], t[5]);
2028  s[6] = _mm_sub_epi16(t[7], t[6]);
2029  s[7] = _mm_add_epi16(t[6], t[7]);
2030  s[8] = t[8];
2031  s[15] = t[15];
2032  s[9]  = _mm_packs_epi32(u[8], u[9]);
2033  s[14] = _mm_packs_epi32(u[10], u[11]);
2034  s[10] = _mm_packs_epi32(u[12], u[13]);
2035  s[13] = _mm_packs_epi32(u[14], u[15]);
2036  s[11] = t[11];
2037  s[12] = t[12];
2038
2039  // stage 5
2040  t[0] = _mm_add_epi16(s[0], s[3]);
2041  t[1] = _mm_add_epi16(s[1], s[2]);
2042  t[2] = _mm_sub_epi16(s[1], s[2]);
2043  t[3] = _mm_sub_epi16(s[0], s[3]);
2044  t[4] = s[4];
2045  t[7] = s[7];
2046
2047  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
2048  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
2049  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2050  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2051  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2052  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2053  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2054  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2055  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2056  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2057  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2058  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2059  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2060  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2061  t[5] = _mm_packs_epi32(u[0], u[1]);
2062  t[6] = _mm_packs_epi32(u[2], u[3]);
2063
2064  t[8] = _mm_add_epi16(s[8], s[11]);
2065  t[9] = _mm_add_epi16(s[9], s[10]);
2066  t[10] = _mm_sub_epi16(s[9], s[10]);
2067  t[11] = _mm_sub_epi16(s[8], s[11]);
2068  t[12] = _mm_sub_epi16(s[15], s[12]);
2069  t[13] = _mm_sub_epi16(s[14], s[13]);
2070  t[14] = _mm_add_epi16(s[13], s[14]);
2071  t[15] = _mm_add_epi16(s[12], s[15]);
2072
2073  // stage 6
2074  s[0] = _mm_add_epi16(t[0], t[7]);
2075  s[1] = _mm_add_epi16(t[1], t[6]);
2076  s[2] = _mm_add_epi16(t[2], t[5]);
2077  s[3] = _mm_add_epi16(t[3], t[4]);
2078  s[4] = _mm_sub_epi16(t[3], t[4]);
2079  s[5] = _mm_sub_epi16(t[2], t[5]);
2080  s[6] = _mm_sub_epi16(t[1], t[6]);
2081  s[7] = _mm_sub_epi16(t[0], t[7]);
2082  s[8] = t[8];
2083  s[9] = t[9];
2084
2085  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
2086  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
2087  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
2088  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
2089
2090  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2091  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2092  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2093  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2094  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2095  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2096  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2097  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2098
2099  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2100  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2101  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2102  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2103  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2104  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2105  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2106  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2107
2108  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2109  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2110  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2111  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2112  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2113  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2114  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2115  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2116
2117  s[10] = _mm_packs_epi32(u[0], u[1]);
2118  s[13] = _mm_packs_epi32(u[2], u[3]);
2119  s[11] = _mm_packs_epi32(u[4], u[5]);
2120  s[12] = _mm_packs_epi32(u[6], u[7]);
2121  s[14] = t[14];
2122  s[15] = t[15];
2123
2124  // stage 7
2125  in[0] = _mm_add_epi16(s[0], s[15]);
2126  in[1] = _mm_add_epi16(s[1], s[14]);
2127  in[2] = _mm_add_epi16(s[2], s[13]);
2128  in[3] = _mm_add_epi16(s[3], s[12]);
2129  in[4] = _mm_add_epi16(s[4], s[11]);
2130  in[5] = _mm_add_epi16(s[5], s[10]);
2131  in[6] = _mm_add_epi16(s[6], s[9]);
2132  in[7] = _mm_add_epi16(s[7], s[8]);
2133  in[8] = _mm_sub_epi16(s[7], s[8]);
2134  in[9] = _mm_sub_epi16(s[6], s[9]);
2135  in[10] = _mm_sub_epi16(s[5], s[10]);
2136  in[11] = _mm_sub_epi16(s[4], s[11]);
2137  in[12] = _mm_sub_epi16(s[3], s[12]);
2138  in[13] = _mm_sub_epi16(s[2], s[13]);
2139  in[14] = _mm_sub_epi16(s[1], s[14]);
2140  in[15] = _mm_sub_epi16(s[0], s[15]);
2141}
2142
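// 16x16 1-D transform wrappers: the block is held as two 16x8 halves
// (in0, in1).  A full 16x16 transpose is followed by the 8-column 1-D
// transform on each half, so calling one of these for the rows and one for
// the columns yields the complete 2-D transform.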
2143void idct16_sse2(__m128i *in0, __m128i *in1) {
2144  array_transpose_16x16(in0, in1);
2145  idct16_8col(in0);
2146  idct16_8col(in1);
2147}
2148
2149void iadst16_sse2(__m128i *in0, __m128i *in1) {
2150  array_transpose_16x16(in0, in1);
2151  iadst16_8col(in0);
2152  iadst16_8col(in1);
2153}
2154
2155void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
2156                               int stride) {
2157  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2158  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2159  const __m128i zero = _mm_setzero_si128();
2160
2161  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2162  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2163  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2164  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2165
2166  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2167  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2168
2169  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2170  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2171  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2172  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2173  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2174  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2175
2176  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2177  __m128i in[16], l[16];
2178  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
2179          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2180          stp1_8_0, stp1_12_0;
2181  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2182          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
2183  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2184  int i;
2185  // First 1-D inverse DCT
2186  // Load input data.
2187  in[0] = _mm_load_si128((const __m128i *)input);
2188  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
2189  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
2190  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
2191
2192  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
2193
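  // In this (at most 10 coefficients) case only the upper-left 4x4 of the
  // 16x16 coefficient block can be nonzero, so just the first four rows
  // (8 coefficients each) are loaded and transposed above; the stages
  // below skip every term that is known to be zero.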
2194  // Stage2
2195  {
2196    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
2197    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
2198
2199    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2200    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2201    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2202    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2203
2204    tmp0 = _mm_add_epi32(tmp0, rounding);
2205    tmp2 = _mm_add_epi32(tmp2, rounding);
2206    tmp5 = _mm_add_epi32(tmp5, rounding);
2207    tmp7 = _mm_add_epi32(tmp7, rounding);
2208
2209    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2210    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2211    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2212    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2213
2214    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
2215    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
2216  }
2217
2218  // Stage3
2219  {
2220    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
2221
2222    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2223    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2224
2225    tmp0 = _mm_add_epi32(tmp0, rounding);
2226    tmp2 = _mm_add_epi32(tmp2, rounding);
2227    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2228    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2229
2230    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2231    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2232
2233    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2234  }
2235
2236  // Stage4
2237  {
2238    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2239    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2240    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2241
2242    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2243    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2244    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2245    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2246    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2247    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2248
2249    tmp0 = _mm_add_epi32(tmp0, rounding);
2250    tmp2 = _mm_add_epi32(tmp2, rounding);
2251    tmp1 = _mm_add_epi32(tmp1, rounding);
2252    tmp3 = _mm_add_epi32(tmp3, rounding);
2253    tmp5 = _mm_add_epi32(tmp5, rounding);
2254    tmp7 = _mm_add_epi32(tmp7, rounding);
2255
2256    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2257    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2258    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2259    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2260    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2261    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2262
2263    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2264    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2265    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2266    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2267
2268    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2269  }
2270
2271  // Stage5 and Stage6
2272  {
2273    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2274    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2275    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2276    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2277
2278    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
2279    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2280    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
2281    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2282
2283    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2284    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2285    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2286    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2287  }
2288
2289  // Stage6
2290  {
2291    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2292    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2293    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2294
2295    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2296    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2297    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2298    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2299    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2300    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2301
2302    tmp1 = _mm_add_epi32(tmp1, rounding);
2303    tmp3 = _mm_add_epi32(tmp3, rounding);
2304    tmp0 = _mm_add_epi32(tmp0, rounding);
2305    tmp2 = _mm_add_epi32(tmp2, rounding);
2306    tmp4 = _mm_add_epi32(tmp4, rounding);
2307    tmp6 = _mm_add_epi32(tmp6, rounding);
2308
2309    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2310    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2311    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2312    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2313    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2314    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2315
2316    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2317
2318    stp2_10 = _mm_packs_epi32(tmp0, zero);
2319    stp2_13 = _mm_packs_epi32(tmp2, zero);
2320    stp2_11 = _mm_packs_epi32(tmp4, zero);
2321    stp2_12 = _mm_packs_epi32(tmp6, zero);
2322
2323    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2324    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2325    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2326    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2327
2328    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2329    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2330    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2331    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2332    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2333    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2334    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2335    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2336  }
2337
2338  // Stage7. Left 8x16 only.
2339  l[0] = _mm_add_epi16(stp2_0, stp1_15);
2340  l[1] = _mm_add_epi16(stp2_1, stp1_14);
2341  l[2] = _mm_add_epi16(stp2_2, stp2_13);
2342  l[3] = _mm_add_epi16(stp2_3, stp2_12);
2343  l[4] = _mm_add_epi16(stp2_4, stp2_11);
2344  l[5] = _mm_add_epi16(stp2_5, stp2_10);
2345  l[6] = _mm_add_epi16(stp2_6, stp1_9);
2346  l[7] = _mm_add_epi16(stp2_7, stp1_8);
2347  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2348  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2349  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2350  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2351  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2352  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2353  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2354  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2355
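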
2356  // Second 1-D inverse transform, performed per 8x16 block
2357  for (i = 0; i < 2; i++) {
2358    int j;
2359    array_transpose_4X8(l + 8 * i, in);
2360
2361    IDCT16_10
2362
2363    // Stage7
2364    in[0] = _mm_add_epi16(stp2_0, stp1_15);
2365    in[1] = _mm_add_epi16(stp2_1, stp1_14);
2366    in[2] = _mm_add_epi16(stp2_2, stp2_13);
2367    in[3] = _mm_add_epi16(stp2_3, stp2_12);
2368    in[4] = _mm_add_epi16(stp2_4, stp2_11);
2369    in[5] = _mm_add_epi16(stp2_5, stp2_10);
2370    in[6] = _mm_add_epi16(stp2_6, stp1_9);
2371    in[7] = _mm_add_epi16(stp2_7, stp1_8);
2372    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2373    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2374    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2375    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2376    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2377    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2378    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2379    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2380
2381    for (j = 0; j < 16; ++j) {
2382      // Final rounding and shift
2383      in[j] = _mm_adds_epi16(in[j], final_rounding);
2384      in[j] = _mm_srai_epi16(in[j], 6);
2385      RECON_AND_STORE(dest + j * stride, in[j]);
2386    }
2387
2388    dest += 8;
2389  }
2390}
2391
2392#define LOAD_DQCOEFF(reg, input) \
2393  {  \
2394    reg = _mm_load_si128((const __m128i *)input); \
2395    input += 8; \
2396  }
2397
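// Reduced 32-point 1-D IDCT used by the 34-coefficient 32x32 shortcut:
// only in[0]..in[7] can be nonzero, so each live coefficient is paired
// with zero instead of its (known-zero) high-frequency partner.
// MULTIPLICATION_AND_ADD_2 and MULTIPLICATION_AND_ADD (defined earlier in
// this file) apply the usual madd / DCT_CONST_ROUNDING / DCT_CONST_BITS /
// packs sequence to one or two interleaved input pairs.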
2398#define IDCT32_34 \
2399/* Stage1 */ \
2400{ \
2401  const __m128i zero = _mm_setzero_si128();\
2402  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2403  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2404  \
2405  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
2406  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2407  \
2408  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2409  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2410  \
2411  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2412  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2413  \
2414  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2415                         stg1_1, stp1_16, stp1_31); \
2416  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2417                         stg1_7, stp1_19, stp1_28); \
2418  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2419                         stg1_9, stp1_20, stp1_27); \
2420  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2421                         stg1_15, stp1_23, stp1_24); \
2422} \
2423\
2424/* Stage2 */ \
2425{ \
2426  const __m128i zero = _mm_setzero_si128();\
2427  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2428  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2429  \
2430  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2431  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2432  \
2433  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2434                         stg2_1, stp2_8, stp2_15); \
2435  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2436                         stg2_7, stp2_11, stp2_12); \
2437  \
2438  stp2_16 = stp1_16; \
2439  stp2_19 = stp1_19; \
2440  \
2441  stp2_20 = stp1_20; \
2442  stp2_23 = stp1_23; \
2443  \
2444  stp2_24 = stp1_24; \
2445  stp2_27 = stp1_27; \
2446  \
2447  stp2_28 = stp1_28; \
2448  stp2_31 = stp1_31; \
2449} \
2450\
2451/* Stage3 */ \
2452{ \
2453  const __m128i zero = _mm_setzero_si128();\
2454  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2455  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2456  \
2457  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2458  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2459  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2460  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2461  \
2462  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2463  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2464  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2465  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
2466  \
2467  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2468                         stg3_1, stp1_4, stp1_7); \
2469  \
2470  stp1_8 = stp2_8; \
2471  stp1_11 = stp2_11; \
2472  stp1_12 = stp2_12; \
2473  stp1_15 = stp2_15; \
2474  \
2475  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2476                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2477                         stp1_18, stp1_29) \
2478  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2479                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2480                         stp1_22, stp1_25) \
2481  \
2482  stp1_16 = stp2_16; \
2483  stp1_31 = stp2_31; \
2484  stp1_19 = stp2_19; \
2485  stp1_20 = stp2_20; \
2486  stp1_23 = stp2_23; \
2487  stp1_24 = stp2_24; \
2488  stp1_27 = stp2_27; \
2489  stp1_28 = stp2_28; \
2490} \
2491\
2492/* Stage4 */ \
2493{ \
2494  const __m128i zero = _mm_setzero_si128();\
2495  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2496  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2497  \
2498  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2499  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2500  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2501  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2502  \
2503  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2504                         stg4_1, stp2_0, stp2_1); \
2505  \
2506  stp2_4 = stp1_4; \
2507  stp2_5 = stp1_4; \
2508  stp2_6 = stp1_7; \
2509  stp2_7 = stp1_7; \
2510  \
2511  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2512                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2513                         stp2_10, stp2_13) \
2514  \
2515  stp2_8 = stp1_8; \
2516  stp2_15 = stp1_15; \
2517  stp2_11 = stp1_11; \
2518  stp2_12 = stp1_12; \
2519  \
2520  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2521  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2522  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2523  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2524  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2525  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2526  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2527  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2528  \
2529  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2530  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2531  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2532  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2533  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2534  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2535  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2536  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2537} \
2538\
2539/* Stage5 */ \
2540{ \
2541  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2542  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2543  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2544  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2545  \
2546  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2547  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2548  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2549  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2550  \
2551  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2552  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2553  \
2554  stp1_0 = stp2_0; \
2555  stp1_1 = stp2_1; \
2556  stp1_2 = stp2_1; \
2557  stp1_3 = stp2_0; \
2558  \
2559  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2560  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2561  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2562  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2563  \
2564  tmp0 = _mm_add_epi32(tmp0, rounding); \
2565  tmp1 = _mm_add_epi32(tmp1, rounding); \
2566  tmp2 = _mm_add_epi32(tmp2, rounding); \
2567  tmp3 = _mm_add_epi32(tmp3, rounding); \
2568  \
2569  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2570  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2571  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2572  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2573  \
2574  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2575  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2576  \
2577  stp1_4 = stp2_4; \
2578  stp1_7 = stp2_7; \
2579  \
2580  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2581  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2582  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2583  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2584  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2585  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2586  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2587  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2588  \
2589  stp1_16 = stp2_16; \
2590  stp1_17 = stp2_17; \
2591  \
2592  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2593                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2594                         stp1_19, stp1_28) \
2595  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2596                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2597                         stp1_21, stp1_26) \
2598  \
2599  stp1_22 = stp2_22; \
2600  stp1_23 = stp2_23; \
2601  stp1_24 = stp2_24; \
2602  stp1_25 = stp2_25; \
2603  stp1_30 = stp2_30; \
2604  stp1_31 = stp2_31; \
2605} \
2606\
2607/* Stage6 */ \
2608{ \
2609  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2610  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2611  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2612  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2613  \
2614  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2615  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2616  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2617  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2618  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2619  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2620  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2621  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2622  \
2623  stp2_8 = stp1_8; \
2624  stp2_9 = stp1_9; \
2625  stp2_14 = stp1_14; \
2626  stp2_15 = stp1_15; \
2627  \
2628  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2629                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2630                         stp2_13, stp2_11, stp2_12) \
2631  \
2632  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2633  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2634  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2635  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2636  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2637  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2638  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2639  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2640  \
2641  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2642  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2643  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2644  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2645  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2646  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2647  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2648  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2649} \
2650\
2651/* Stage7 */ \
2652{ \
2653  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2654  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2655  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2656  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2657  \
2658  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2659  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2660  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2661  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2662  \
2663  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2664  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2665  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2666  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2667  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2668  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2669  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2670  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2671  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2672  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2673  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2674  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2675  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2676  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2677  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2678  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2679  \
2680  stp1_16 = stp2_16; \
2681  stp1_17 = stp2_17; \
2682  stp1_18 = stp2_18; \
2683  stp1_19 = stp2_19; \
2684  \
2685  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2686                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
2687                         stp1_21, stp1_26) \
2688  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
2689                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
2690                         stp1_23, stp1_24) \
2691  \
2692  stp1_28 = stp2_28; \
2693  stp1_29 = stp2_29; \
2694  stp1_30 = stp2_30; \
2695  stp1_31 = stp2_31; \
2696}
2697
2698
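// IDCT32 runs one 1-D 32-point inverse DCT across the eight 16-bit lanes of
// in[0]..in[31] (one __m128i per transform input index).  It expands the
// seven butterfly stages below; the final add/sub between the two 16-sample
// halves is left to the caller, which also declares the stg*_* constants,
// the rounding vector and the stp1_*/stp2_*/tmp* temporaries.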
2699#define IDCT32 \
2700/* Stage1 */ \
2701{ \
2702  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
2703  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
2704  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
2705  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
2706  \
2707  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
2708  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
2709  const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
2710  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
2711  \
2712  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
2713  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
2714  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
2715  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
2716  \
2717  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
2718  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
2719  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
2720  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
2721  \
2722  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
2723                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
2724                         stp1_17, stp1_30) \
2725  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
2726                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
2727                         stp1_19, stp1_28) \
2728  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
2729                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
2730                         stp1_21, stp1_26) \
2731  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
2732                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
2733                         stp1_23, stp1_24) \
2734} \
2735\
2736/* Stage2 */ \
2737{ \
2738  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
2739  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
2740  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
2741  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
2742  \
2743  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
2744  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
2745  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
2746  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
2747  \
2748  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
2749                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
2750                         stp2_14) \
2751  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
2752                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
2753                         stp2_11, stp2_12) \
2754  \
2755  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
2756  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
2757  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
2758  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
2759  \
2760  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
2761  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
2762  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
2763  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
2764  \
2765  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
2766  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
2767  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
2768  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
2769  \
2770  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
2771  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
2772  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
2773  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
2774} \
2775\
2776/* Stage3 */ \
2777{ \
2778  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
2779  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
2780  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
2781  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
2782  \
2783  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
2784  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
2785  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2786  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2787  \
2788  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2789  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2790  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2791  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2792  \
2793  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
2794                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
2795                         stp1_6) \
2796  \
2797  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
2798  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
2799  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
2800  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
2801  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
2802  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
2803  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
2804  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
2805  \
2806  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2807                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2808                         stp1_18, stp1_29) \
2809  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2810                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2811                         stp1_22, stp1_25) \
2812  \
2813  stp1_16 = stp2_16; \
2814  stp1_31 = stp2_31; \
2815  stp1_19 = stp2_19; \
2816  stp1_20 = stp2_20; \
2817  stp1_23 = stp2_23; \
2818  stp1_24 = stp2_24; \
2819  stp1_27 = stp2_27; \
2820  stp1_28 = stp2_28; \
2821} \
2822\
2823/* Stage4 */ \
2824{ \
2825  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
2826  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
2827  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
2828  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
2829  \
2830  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
2831  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
2832  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2833  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2834  \
2835  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
2836                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
2837                         stp2_2, stp2_3) \
2838  \
2839  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
2840  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
2841  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
2842  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
2843  \
2844  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2845                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2846                         stp2_10, stp2_13) \
2847  \
2848  stp2_8 = stp1_8; \
2849  stp2_15 = stp1_15; \
2850  stp2_11 = stp1_11; \
2851  stp2_12 = stp1_12; \
2852  \
2853  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2854  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2855  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2856  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2857  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2858  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2859  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2860  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2861  \
2862  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2863  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2864  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2865  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2866  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2867  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2868  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2869  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2870} \
2871\
2872/* Stage5 */ \
2873{ \
2874  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2875  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2876  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2877  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2878  \
2879  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2880  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2881  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2882  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2883  \
2884  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2885  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2886  \
2887  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
2888  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
2889  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
2890  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
2891  \
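  /* Same cospi_16_64 rotation as in the reduced macro above: \
     stp1_5 = (stp2_6 - stp2_5) * cospi_16_64, \
     stp1_6 = (stp2_6 + stp2_5) * cospi_16_64. */ \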
2892  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2893  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2894  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2895  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2896  \
2897  tmp0 = _mm_add_epi32(tmp0, rounding); \
2898  tmp1 = _mm_add_epi32(tmp1, rounding); \
2899  tmp2 = _mm_add_epi32(tmp2, rounding); \
2900  tmp3 = _mm_add_epi32(tmp3, rounding); \
2901  \
2902  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2903  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2904  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2905  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2906  \
2907  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2908  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2909  \
2910  stp1_4 = stp2_4; \
2911  stp1_7 = stp2_7; \
2912  \
2913  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2914  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2915  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2916  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2917  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2918  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2919  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2920  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2921  \
2922  stp1_16 = stp2_16; \
2923  stp1_17 = stp2_17; \
2924  \
2925  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2926                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2927                         stp1_19, stp1_28) \
2928  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2929                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2930                         stp1_21, stp1_26) \
2931  \
2932  stp1_22 = stp2_22; \
2933  stp1_23 = stp2_23; \
2934  stp1_24 = stp2_24; \
2935  stp1_25 = stp2_25; \
2936  stp1_30 = stp2_30; \
2937  stp1_31 = stp2_31; \
2938} \
2939\
2940/* Stage6 */ \
2941{ \
2942  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2943  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2944  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2945  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2946  \
2947  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2948  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2949  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2950  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2951  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2952  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2953  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2954  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2955  \
2956  stp2_8 = stp1_8; \
2957  stp2_9 = stp1_9; \
2958  stp2_14 = stp1_14; \
2959  stp2_15 = stp1_15; \
2960  \
2961  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2962                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2963                         stp2_13, stp2_11, stp2_12) \
2964  \
2965  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2966  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2967  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2968  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2969  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2970  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2971  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2972  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2973  \
2974  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2975  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2976  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2977  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2978  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2979  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2980  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2981  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2982} \
2983\
2984/* Stage7 */ \
2985{ \
2986  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2987  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2988  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2989  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2990  \
2991  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2992  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2993  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2994  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2995  \
2996  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2997  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2998  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2999  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3000  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3001  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3002  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3003  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3004  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3005  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3006  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3007  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3008  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3009  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3010  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3011  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3012  \
3013  stp1_16 = stp2_16; \
3014  stp1_17 = stp2_17; \
3015  stp1_18 = stp2_18; \
3016  stp1_19 = stp2_19; \
3017  \
3018  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3019                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3020                         stp1_21, stp1_26) \
3021  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3022                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3023                         stp1_23, stp1_24) \
3024  \
3025  stp1_28 = stp2_28; \
3026  stp1_29 = stp2_29; \
3027  stp1_30 = stp2_30; \
3028  stp1_31 = stp2_31; \
3029}
3030
3031// Only the upper-left 8x8 block has non-zero coefficients.
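// The 2-D inverse transform is computed as two 1-D passes: the code below
// runs the row transform once to build the 8x32 intermediate block in
// col[], then the loop transposes each 8x8 piece of col[] and runs the
// column transform before final rounding and reconstruction.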
3032void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
3033                               int stride) {
3034  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3035  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3036
3037  // idct constants for each stage
3038  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3039  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3040  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3041  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3042  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3043  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3044  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3045  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3046
3047  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3048  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3049  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3050  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3051
3052  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3053  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3054  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3055  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3056  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3057  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3058  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3059  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3060
3061  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3062  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3063  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3064  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3065  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3066
3067  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3068
3069  __m128i in[32], col[32];
3070  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3071          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3072          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3073          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3074          stp1_30, stp1_31;
3075  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3076          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3077          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3078          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3079          stp2_30, stp2_31;
3080  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3081  int i;
3082
3083  // Load input data. Only need to load the top left 8x8 block.
3084  in[0] = _mm_load_si128((const __m128i *)input);
3085  in[1] = _mm_load_si128((const __m128i *)(input + 32));
3086  in[2] = _mm_load_si128((const __m128i *)(input + 64));
3087  in[3] = _mm_load_si128((const __m128i *)(input + 96));
3088  in[4] = _mm_load_si128((const __m128i *)(input + 128));
3089  in[5] = _mm_load_si128((const __m128i *)(input + 160));
3090  in[6] = _mm_load_si128((const __m128i *)(input + 192));
3091  in[7] = _mm_load_si128((const __m128i *)(input + 224));
3092
3093  for (i = 8; i < 32; ++i) {
3094    in[i] = _mm_setzero_si128();
3095  }
3096
3097  array_transpose_8x8(in, in);
3098  // TODO(hkuang): The following transposes are unnecessary, but removing
3099  // them leads to a performance drop on some devices.
3100  array_transpose_8x8(in + 8, in + 8);
3101  array_transpose_8x8(in + 16, in + 16);
3102  array_transpose_8x8(in + 24, in + 24);
3103
3104  IDCT32_34
3105
3106  // 1-D: Store 32 intermediate results for each 8x32 block.
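  // The last butterfly of the 32-point transform is applied here:
  // col[k] = stp1_k + stp1_(31-k) and col[31-k] = stp1_k - stp1_(31-k).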
3107  col[0] = _mm_add_epi16(stp1_0, stp1_31);
3108  col[1] = _mm_add_epi16(stp1_1, stp1_30);
3109  col[2] = _mm_add_epi16(stp1_2, stp1_29);
3110  col[3] = _mm_add_epi16(stp1_3, stp1_28);
3111  col[4] = _mm_add_epi16(stp1_4, stp1_27);
3112  col[5] = _mm_add_epi16(stp1_5, stp1_26);
3113  col[6] = _mm_add_epi16(stp1_6, stp1_25);
3114  col[7] = _mm_add_epi16(stp1_7, stp1_24);
3115  col[8] = _mm_add_epi16(stp1_8, stp1_23);
3116  col[9] = _mm_add_epi16(stp1_9, stp1_22);
3117  col[10] = _mm_add_epi16(stp1_10, stp1_21);
3118  col[11] = _mm_add_epi16(stp1_11, stp1_20);
3119  col[12] = _mm_add_epi16(stp1_12, stp1_19);
3120  col[13] = _mm_add_epi16(stp1_13, stp1_18);
3121  col[14] = _mm_add_epi16(stp1_14, stp1_17);
3122  col[15] = _mm_add_epi16(stp1_15, stp1_16);
3123  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3124  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3125  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3126  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3127  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3128  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3129  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3130  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3131  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3132  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3133  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3134  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3135  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3136  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3137  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3138  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
3139  for (i = 0; i < 4; i++) {
3140    int j;
3141    const __m128i zero = _mm_setzero_si128();
3142    // Transpose 32x8 block to 8x32 block
3143    array_transpose_8x8(col + i * 8, in);
3144    IDCT32_34
3145
3146    // 2-D: Calculate the results and store them to the destination.
3147    in[0] = _mm_add_epi16(stp1_0, stp1_31);
3148    in[1] = _mm_add_epi16(stp1_1, stp1_30);
3149    in[2] = _mm_add_epi16(stp1_2, stp1_29);
3150    in[3] = _mm_add_epi16(stp1_3, stp1_28);
3151    in[4] = _mm_add_epi16(stp1_4, stp1_27);
3152    in[5] = _mm_add_epi16(stp1_5, stp1_26);
3153    in[6] = _mm_add_epi16(stp1_6, stp1_25);
3154    in[7] = _mm_add_epi16(stp1_7, stp1_24);
3155    in[8] = _mm_add_epi16(stp1_8, stp1_23);
3156    in[9] = _mm_add_epi16(stp1_9, stp1_22);
3157    in[10] = _mm_add_epi16(stp1_10, stp1_21);
3158    in[11] = _mm_add_epi16(stp1_11, stp1_20);
3159    in[12] = _mm_add_epi16(stp1_12, stp1_19);
3160    in[13] = _mm_add_epi16(stp1_13, stp1_18);
3161    in[14] = _mm_add_epi16(stp1_14, stp1_17);
3162    in[15] = _mm_add_epi16(stp1_15, stp1_16);
3163    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3164    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3165    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3166    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3167    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3168    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3169    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3170    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3171    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3172    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3173    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3174    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3175    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3176    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3177    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3178    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3179
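    // Adding 1 << 5 and shifting right by 6 implements
    // ROUND_POWER_OF_TWO(x, 6); RECON_AND_STORE then adds each row of
    // residual to the 8 destination pixels with unsigned saturation.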
3180    for (j = 0; j < 32; ++j) {
3181      // Final rounding and shift
3182      in[j] = _mm_adds_epi16(in[j], final_rounding);
3183      in[j] = _mm_srai_epi16(in[j], 6);
3184      RECON_AND_STORE(dest + j * stride, in[j]);
3185    }
3186
3187    dest += 8;
3188  }
3189}
3190
3191void vpx_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
3192                                 int stride) {
3193  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3194  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3195  const __m128i zero = _mm_setzero_si128();
3196
3197  // idct constants for each stage
3198  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3199  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3200  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3201  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3202  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3203  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3204  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3205  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3206  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3207  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3208  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3209  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3210  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3211  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3212  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3213  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3214
3215  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3216  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3217  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3218  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3219  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3220  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3221  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3222  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3223
3224  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3225  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3226  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3227  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3228  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3229  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3230  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3231  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3232  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3233  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3234
3235  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3236  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3237  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3238  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3239  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3240  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3241  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3242
3243  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3244
3245  __m128i in[32], col[128], zero_idx[16];
3246  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3247          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3248          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3249          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3250          stp1_30, stp1_31;
3251  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3252          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3253          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3254          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3255          stp2_30, stp2_31;
3256  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3257  int i, j, i32;
3258
3259  for (i = 0; i < 4; i++) {
3260    i32 = (i << 5);
3261    // First 1-D idct
3262    // Load input data.
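    // LOAD_DQCOEFF reads the next 8 coefficients and advances the input
    // pointer, so rows are consumed in order; the in[] indices interleave
    // the four 8-wide column groups so that in[0..7], in[8..15], in[16..23]
    // and in[24..31] each hold an 8x8 sub-block ready for
    // array_transpose_8x8 below.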
3263    LOAD_DQCOEFF(in[0], input);
3264    LOAD_DQCOEFF(in[8], input);
3265    LOAD_DQCOEFF(in[16], input);
3266    LOAD_DQCOEFF(in[24], input);
3267    LOAD_DQCOEFF(in[1], input);
3268    LOAD_DQCOEFF(in[9], input);
3269    LOAD_DQCOEFF(in[17], input);
3270    LOAD_DQCOEFF(in[25], input);
3271    LOAD_DQCOEFF(in[2], input);
3272    LOAD_DQCOEFF(in[10], input);
3273    LOAD_DQCOEFF(in[18], input);
3274    LOAD_DQCOEFF(in[26], input);
3275    LOAD_DQCOEFF(in[3], input);
3276    LOAD_DQCOEFF(in[11], input);
3277    LOAD_DQCOEFF(in[19], input);
3278    LOAD_DQCOEFF(in[27], input);
3279
3280    LOAD_DQCOEFF(in[4], input);
3281    LOAD_DQCOEFF(in[12], input);
3282    LOAD_DQCOEFF(in[20], input);
3283    LOAD_DQCOEFF(in[28], input);
3284    LOAD_DQCOEFF(in[5], input);
3285    LOAD_DQCOEFF(in[13], input);
3286    LOAD_DQCOEFF(in[21], input);
3287    LOAD_DQCOEFF(in[29], input);
3288    LOAD_DQCOEFF(in[6], input);
3289    LOAD_DQCOEFF(in[14], input);
3290    LOAD_DQCOEFF(in[22], input);
3291    LOAD_DQCOEFF(in[30], input);
3292    LOAD_DQCOEFF(in[7], input);
3293    LOAD_DQCOEFF(in[15], input);
3294    LOAD_DQCOEFF(in[23], input);
3295    LOAD_DQCOEFF(in[31], input);
3296
3297    // Check whether all coefficients in this 8x32 block are zero.
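    // OR all 32 vectors down to a single vector: the block can be skipped
    // (its 1-D transform output is all zero) only if every lane of that
    // vector is zero, i.e. the movemask of the compare below is 0xFFFF.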
3298    zero_idx[0] = _mm_or_si128(in[0], in[1]);
3299    zero_idx[1] = _mm_or_si128(in[2], in[3]);
3300    zero_idx[2] = _mm_or_si128(in[4], in[5]);
3301    zero_idx[3] = _mm_or_si128(in[6], in[7]);
3302    zero_idx[4] = _mm_or_si128(in[8], in[9]);
3303    zero_idx[5] = _mm_or_si128(in[10], in[11]);
3304    zero_idx[6] = _mm_or_si128(in[12], in[13]);
3305    zero_idx[7] = _mm_or_si128(in[14], in[15]);
3306    zero_idx[8] = _mm_or_si128(in[16], in[17]);
3307    zero_idx[9] = _mm_or_si128(in[18], in[19]);
3308    zero_idx[10] = _mm_or_si128(in[20], in[21]);
3309    zero_idx[11] = _mm_or_si128(in[22], in[23]);
3310    zero_idx[12] = _mm_or_si128(in[24], in[25]);
3311    zero_idx[13] = _mm_or_si128(in[26], in[27]);
3312    zero_idx[14] = _mm_or_si128(in[28], in[29]);
3313    zero_idx[15] = _mm_or_si128(in[30], in[31]);
3314
3315    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3316    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3317    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3318    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3319    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3320    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3321    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3322    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3323
3324    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3325    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3326    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3327    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3328    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3329    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3330    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3331
3332    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3333      col[i32 + 0] = _mm_setzero_si128();
3334      col[i32 + 1] = _mm_setzero_si128();
3335      col[i32 + 2] = _mm_setzero_si128();
3336      col[i32 + 3] = _mm_setzero_si128();
3337      col[i32 + 4] = _mm_setzero_si128();
3338      col[i32 + 5] = _mm_setzero_si128();
3339      col[i32 + 6] = _mm_setzero_si128();
3340      col[i32 + 7] = _mm_setzero_si128();
3341      col[i32 + 8] = _mm_setzero_si128();
3342      col[i32 + 9] = _mm_setzero_si128();
3343      col[i32 + 10] = _mm_setzero_si128();
3344      col[i32 + 11] = _mm_setzero_si128();
3345      col[i32 + 12] = _mm_setzero_si128();
3346      col[i32 + 13] = _mm_setzero_si128();
3347      col[i32 + 14] = _mm_setzero_si128();
3348      col[i32 + 15] = _mm_setzero_si128();
3349      col[i32 + 16] = _mm_setzero_si128();
3350      col[i32 + 17] = _mm_setzero_si128();
3351      col[i32 + 18] = _mm_setzero_si128();
3352      col[i32 + 19] = _mm_setzero_si128();
3353      col[i32 + 20] = _mm_setzero_si128();
3354      col[i32 + 21] = _mm_setzero_si128();
3355      col[i32 + 22] = _mm_setzero_si128();
3356      col[i32 + 23] = _mm_setzero_si128();
3357      col[i32 + 24] = _mm_setzero_si128();
3358      col[i32 + 25] = _mm_setzero_si128();
3359      col[i32 + 26] = _mm_setzero_si128();
3360      col[i32 + 27] = _mm_setzero_si128();
3361      col[i32 + 28] = _mm_setzero_si128();
3362      col[i32 + 29] = _mm_setzero_si128();
3363      col[i32 + 30] = _mm_setzero_si128();
3364      col[i32 + 31] = _mm_setzero_si128();
3365      continue;
3366    }
3367
3368    // Transpose 32x8 block to 8x32 block
3369    array_transpose_8x8(in, in);
3370    array_transpose_8x8(in + 8, in + 8);
3371    array_transpose_8x8(in + 16, in + 16);
3372    array_transpose_8x8(in + 24, in + 24);
3373
3374    IDCT32
3375
3376    // 1-D: Store 32 intermediate results for each 8x32 block.
3377    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3378    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3379    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3380    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3381    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3382    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3383    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3384    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3385    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3386    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3387    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3388    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3389    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3390    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3391    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3392    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3393    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3394    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3395    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3396    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3397    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3398    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3399    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3400    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3401    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3402    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3403    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3404    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3405    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3406    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3407    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3408    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3409  }
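  // Second pass: for each group of 8 columns, gather the corresponding
  // 8x32 strip from col[], run the column transform and reconstruct.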
3410  for (i = 0; i < 4; i++) {
3411    // Second 1-D idct
3412    j = i << 3;
3413
3414    // Transpose 32x8 block to 8x32 block
3415    array_transpose_8x8(col + j, in);
3416    array_transpose_8x8(col + j + 32, in + 8);
3417    array_transpose_8x8(col + j + 64, in + 16);
3418    array_transpose_8x8(col + j + 96, in + 24);
3419
3420    IDCT32
3421
3422    // 2-D: Calculate the results and store them to the destination.
3423    in[0] = _mm_add_epi16(stp1_0, stp1_31);
3424    in[1] = _mm_add_epi16(stp1_1, stp1_30);
3425    in[2] = _mm_add_epi16(stp1_2, stp1_29);
3426    in[3] = _mm_add_epi16(stp1_3, stp1_28);
3427    in[4] = _mm_add_epi16(stp1_4, stp1_27);
3428    in[5] = _mm_add_epi16(stp1_5, stp1_26);
3429    in[6] = _mm_add_epi16(stp1_6, stp1_25);
3430    in[7] = _mm_add_epi16(stp1_7, stp1_24);
3431    in[8] = _mm_add_epi16(stp1_8, stp1_23);
3432    in[9] = _mm_add_epi16(stp1_9, stp1_22);
3433    in[10] = _mm_add_epi16(stp1_10, stp1_21);
3434    in[11] = _mm_add_epi16(stp1_11, stp1_20);
3435    in[12] = _mm_add_epi16(stp1_12, stp1_19);
3436    in[13] = _mm_add_epi16(stp1_13, stp1_18);
3437    in[14] = _mm_add_epi16(stp1_14, stp1_17);
3438    in[15] = _mm_add_epi16(stp1_15, stp1_16);
3439    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3440    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3441    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3442    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3443    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3444    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3445    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3446    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3447    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3448    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3449    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3450    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3451    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3452    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3453    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3454    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3455
3456    for (j = 0; j < 32; ++j) {
3457      // Final rounding and shift
3458      in[j] = _mm_adds_epi16(in[j], final_rounding);
3459      in[j] = _mm_srai_epi16(in[j], 6);
3460      RECON_AND_STORE(dest + j * stride, in[j]);
3461    }
3462
3463    dest += 8;
3464  }
3465}
3466
3467void vpx_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
3468  __m128i dc_value;
3469  const __m128i zero = _mm_setzero_si128();
3470  int a, i;
3471
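  // For a DC-only block both 1-D passes reduce to scaling input[0] by
  // cospi_16_64 (cos(pi/4) in Q14) through dct_const_round_shift, so every
  // output pixel receives the same residual: scale twice, apply the final
  // ROUND_POWER_OF_TWO(., 6), and add the value to all 32x32 pixels below.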
3472  a = dct_const_round_shift(input[0] * cospi_16_64);
3473  a = dct_const_round_shift(a * cospi_16_64);
3474  a = ROUND_POWER_OF_TWO(a, 6);
3475
3476  dc_value = _mm_set1_epi16(a);
3477
3478  for (i = 0; i < 4; ++i) {
3479    int j;
3480    for (j = 0; j < 32; ++j) {
3481      RECON_AND_STORE(dest + j * stride, dc_value);
3482    }
3483    dest += 8;
3484  }
3485}
3486
3487#if CONFIG_VP9_HIGHBITDEPTH
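// Clamp eight signed 16-bit pixel values to [0, (1 << bd) - 1]: values above
// the maximum are replaced via a compare/select, and negative values are
// zeroed by the final compare mask.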
3488static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
3489  __m128i ubounded, retval;
3490  const __m128i zero = _mm_set1_epi16(0);
3491  const __m128i one = _mm_set1_epi16(1);
3492  const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
3493  ubounded = _mm_cmpgt_epi16(value, max);
3494  retval = _mm_andnot_si128(ubounded, value);
3495  ubounded = _mm_and_si128(ubounded, max);
3496  retval = _mm_or_si128(retval, ubounded);
3497  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
3498  return retval;
3499}
3500
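// High-bitdepth 4x4 inverse transform.  The coefficients are packed to
// 16 bits; when every value passes the range check below (the +/-12043
// bound that guards the 16-bit arithmetic) the fast idct4_sse2 path is used
// for the rows and, if the intermediates also stay in range, for the
// columns.  Otherwise the function falls back to the C transform
// vpx_highbd_idct4_c.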
3501void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
3502                                    int stride, int bd) {
3503  tran_low_t out[4 * 4];
3504  tran_low_t *outptr = out;
3505  int i, j;
3506  __m128i inptr[4];
3507  __m128i sign_bits[2];
3508  __m128i temp_mm, min_input, max_input;
3509  int test;
3510  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3511  int optimised_cols = 0;
3512  const __m128i zero = _mm_set1_epi16(0);
3513  const __m128i eight = _mm_set1_epi16(8);
3514  const __m128i max = _mm_set1_epi16(12043);
3515  const __m128i min = _mm_set1_epi16(-12043);
3516  // Load input into __m128i
3517  inptr[0] = _mm_loadu_si128((const __m128i *)input);
3518  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
3519  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
3520  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
3521
3522  // Pack to 16 bits
3523  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
3524  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
3525
3526  max_input = _mm_max_epi16(inptr[0], inptr[1]);
3527  min_input = _mm_min_epi16(inptr[0], inptr[1]);
3528  max_input = _mm_cmpgt_epi16(max_input, max);
3529  min_input = _mm_cmplt_epi16(min_input, min);
3530  temp_mm = _mm_or_si128(max_input, min_input);
3531  test = _mm_movemask_epi8(temp_mm);
3532
3533  if (!test) {
3534    // Do the row transform
3535    idct4_sse2(inptr);
3536
3537    // Check the min & max values
3538    max_input = _mm_max_epi16(inptr[0], inptr[1]);
3539    min_input = _mm_min_epi16(inptr[0], inptr[1]);
3540    max_input = _mm_cmpgt_epi16(max_input, max);
3541    min_input = _mm_cmplt_epi16(min_input, min);
3542    temp_mm = _mm_or_si128(max_input, min_input);
3543    test = _mm_movemask_epi8(temp_mm);
3544
3545    if (test) {
3546      transpose_4x4(inptr);
3547      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
3548      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
3549      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
3550      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
3551      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
3552      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
3553      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
3554      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
3555      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
3556      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
3557    } else {
3558      // Set to use the optimised transform for the column
3559      optimised_cols = 1;
3560    }
3561  } else {
3562    // Run the un-optimised row transform
3563    for (i = 0; i < 4; ++i) {
3564      vpx_highbd_idct4_c(input, outptr, bd);
3565      input += 4;
3566      outptr += 4;
3567    }
3568  }
3569
3570  if (optimised_cols) {
3571    idct4_sse2(inptr);
3572
3573    // Final round and shift
3574    inptr[0] = _mm_add_epi16(inptr[0], eight);
3575    inptr[1] = _mm_add_epi16(inptr[1], eight);
3576
3577    inptr[0] = _mm_srai_epi16(inptr[0], 4);
3578    inptr[1] = _mm_srai_epi16(inptr[1], 4);
3579
3580    // Reconstruction and Store
3581    {
3582      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
3583      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
3584      d0 = _mm_unpacklo_epi64(
3585          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
3586      d2 = _mm_unpacklo_epi64(
3587          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
3588      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
3589      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
3590      // Store row 0
3591      _mm_storel_epi64((__m128i *)dest, d0);
3592      // Store row 1
3593      d0 = _mm_srli_si128(d0, 8);
3594      _mm_storel_epi64((__m128i *)(dest + stride), d0);
3595      // Store row 2
3596      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
3597      // Store row 3
3598      d2 = _mm_srli_si128(d2, 8);
3599      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
3600    }
3601  } else {
3602    // Run the un-optimised column transform
3603    tran_low_t temp_in[4], temp_out[4];
3604    // Columns
3605    for (i = 0; i < 4; ++i) {
3606      for (j = 0; j < 4; ++j)
3607        temp_in[j] = out[j * 4 + i];
3608      vpx_highbd_idct4_c(temp_in, temp_out, bd);
3609      for (j = 0; j < 4; ++j) {
3610        dest[j * stride + i] = highbd_clip_pixel_add(
3611            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
3612      }
3613    }
3614  }
3615}
3616
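// High-bitdepth 8x8 inverse transform, using the same guarded 16-bit fast
// path / C fallback strategy as the 4x4 version above, with a +/-6201
// range check.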
3617void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
3618                                    int stride, int bd) {
3619  tran_low_t out[8 * 8];
3620  tran_low_t *outptr = out;
3621  int i, j, test;
3622  __m128i inptr[8];
3623  __m128i min_input, max_input, temp1, temp2, sign_bits;
3624  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3625  const __m128i zero = _mm_set1_epi16(0);
3626  const __m128i sixteen = _mm_set1_epi16(16);
3627  const __m128i max = _mm_set1_epi16(6201);
3628  const __m128i min = _mm_set1_epi16(-6201);
3629  int optimised_cols = 0;
3630
3631  // Load input into __m128i & pack to 16 bits
3632  for (i = 0; i < 8; i++) {
3633    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3634    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3635    inptr[i] = _mm_packs_epi32(temp1, temp2);
3636  }
3637
3638  // Find the min & max for the row transform
3639  max_input = _mm_max_epi16(inptr[0], inptr[1]);
3640  min_input = _mm_min_epi16(inptr[0], inptr[1]);
3641  for (i = 2; i < 8; i++) {
3642    max_input = _mm_max_epi16(max_input, inptr[i]);
3643    min_input = _mm_min_epi16(min_input, inptr[i]);
3644  }
3645  max_input = _mm_cmpgt_epi16(max_input, max);
3646  min_input = _mm_cmplt_epi16(min_input, min);
3647  temp1 = _mm_or_si128(max_input, min_input);
3648  test = _mm_movemask_epi8(temp1);
3649
3650  if (!test) {
3651    // Do the row transform
3652    idct8_sse2(inptr);
3653
3654    // Find the min & max for the column transform
3655    max_input = _mm_max_epi16(inptr[0], inptr[1]);
3656    min_input = _mm_min_epi16(inptr[0], inptr[1]);
3657    for (i = 2; i < 8; i++) {
3658      max_input = _mm_max_epi16(max_input, inptr[i]);
3659      min_input = _mm_min_epi16(min_input, inptr[i]);
3660    }
3661    max_input = _mm_cmpgt_epi16(max_input, max);
3662    min_input = _mm_cmplt_epi16(min_input, min);
3663    temp1 = _mm_or_si128(max_input, min_input);
3664    test = _mm_movemask_epi8(temp1);
3665
3666    if (test) {
3667      array_transpose_8x8(inptr, inptr);
3668      for (i = 0; i < 8; i++) {
3669        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3670        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3671        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3672        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3673        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3674      }
3675    } else {
3676      // Set to use the optimised transform for the column
3677      optimised_cols = 1;
3678    }
3679  } else {
3680    // Run the un-optimised row transform
3681    for (i = 0; i < 8; ++i) {
3682      vpx_highbd_idct8_c(input, outptr, bd);
3683      input += 8;
3684      outptr += 8;
3685    }
3686  }
3687
3688  if (optimised_cols) {
3689    idct8_sse2(inptr);
3690
3691    // Final round & shift, reconstruction, and store
3692    {
3693      __m128i d[8];
3694      for (i = 0; i < 8; i++) {
3695        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3696        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3697        inptr[i] = _mm_srai_epi16(inptr[i], 5);
3698        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3699        // Store
3700        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
3701      }
3702    }
3703  } else {
3704    // Run the un-optimised column transform
3705    tran_low_t temp_in[8], temp_out[8];
3706    for (i = 0; i < 8; ++i) {
3707      for (j = 0; j < 8; ++j)
3708        temp_in[j] = out[j * 8 + i];
3709      vpx_highbd_idct8_c(temp_in, temp_out, bd);
3710      for (j = 0; j < 8; ++j) {
3711        dest[j * stride + i] = highbd_clip_pixel_add(
3712            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3713      }
3714    }
3715  }
3716}
3717
3718void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3719                                    int stride, int bd) {
3720  tran_low_t out[8 * 8] = { 0 };
3721  tran_low_t *outptr = out;
3722  int i, j, test;
3723  __m128i inptr[8];
3724  __m128i min_input, max_input, temp1, temp2, sign_bits;
3725  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3726  const __m128i zero = _mm_set1_epi16(0);
3727  const __m128i sixteen = _mm_set1_epi16(16);
3728  const __m128i max = _mm_set1_epi16(6201);
3729  const __m128i min = _mm_set1_epi16(-6201);
3730  int optimised_cols = 0;
3731
3732  // Load input into __m128i & pack to 16 bits
3733  for (i = 0; i < 8; i++) {
3734    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3735    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3736    inptr[i] = _mm_packs_epi32(temp1, temp2);
3737  }
3738
3739  // Find the min & max for the row transform
3740  // Only the first 4 rows have non-zero coefficients.
3741  max_input = _mm_max_epi16(inptr[0], inptr[1]);
3742  min_input = _mm_min_epi16(inptr[0], inptr[1]);
3743  for (i = 2; i < 4; i++) {
3744    max_input = _mm_max_epi16(max_input, inptr[i]);
3745    min_input = _mm_min_epi16(min_input, inptr[i]);
3746  }
3747  max_input = _mm_cmpgt_epi16(max_input, max);
3748  min_input = _mm_cmplt_epi16(min_input, min);
3749  temp1 = _mm_or_si128(max_input, min_input);
3750  test = _mm_movemask_epi8(temp1);
3751
3752  if (!test) {
3753    // Do the row transform
3754    idct8_sse2(inptr);
3755
3756    // Find the min & max for the column transform
3757    // N.B. Only first 4 cols contain non-zero coeffs
3758    max_input = _mm_max_epi16(inptr[0], inptr[1]);
3759    min_input = _mm_min_epi16(inptr[0], inptr[1]);
3760    for (i = 2; i < 8; i++) {
3761      max_input = _mm_max_epi16(max_input, inptr[i]);
3762      min_input = _mm_min_epi16(min_input, inptr[i]);
3763    }
3764    max_input = _mm_cmpgt_epi16(max_input, max);
3765    min_input = _mm_cmplt_epi16(min_input, min);
3766    temp1 = _mm_or_si128(max_input, min_input);
3767    test = _mm_movemask_epi8(temp1);
3768
3769    if (test) {
3770      // Use the fact that only the first 4 rows contain non-zero coeffs
3771      array_transpose_4X8(inptr, inptr);
3772      for (i = 0; i < 4; i++) {
3773        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3774        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3775        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3776        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3777        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3778      }
3779    } else {
3780      // Set to use the optimised transform for the column
3781      optimised_cols = 1;
3782    }
3783  } else {
3784    // Run the un-optimised row transform
3785    for (i = 0; i < 4; ++i) {
3786      vpx_highbd_idct8_c(input, outptr, bd);
3787      input += 8;
3788      outptr += 8;
3789    }
3790  }
3791
3792  if (optimised_cols) {
3793    idct8_sse2(inptr);
3794
3795    // Final round & shift, reconstruction, and store
3796    {
3797      __m128i d[8];
3798      for (i = 0; i < 8; i++) {
3799        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3800        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3801        inptr[i] = _mm_srai_epi16(inptr[i], 5);
3802        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3803        // Store
3804        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
3805      }
3806    }
3807  } else {
3808    // Run the un-optimised column transform
3809    tran_low_t temp_in[8], temp_out[8];
3810    for (i = 0; i < 8; ++i) {
3811      for (j = 0; j < 8; ++j)
3812        temp_in[j] = out[j * 8 + i];
3813      vpx_highbd_idct8_c(temp_in, temp_out, bd);
3814      for (j = 0; j < 8; ++j) {
3815        dest[j * stride + i] = highbd_clip_pixel_add(
3816            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3817      }
3818    }
3819  }
3820}
3821
3822void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
3823                                       int stride, int bd) {
3824  tran_low_t out[16 * 16];
3825  tran_low_t *outptr = out;
3826  int i, j, test;
3827  __m128i inptr[32];
3828  __m128i min_input, max_input, temp1, temp2, sign_bits;
3829  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3830  const __m128i zero = _mm_set1_epi16(0);
3831  const __m128i rounding = _mm_set1_epi16(32);
3832  const __m128i max = _mm_set1_epi16(3155);
3833  const __m128i min = _mm_set1_epi16(-3155);
3834  int optimised_cols = 0;
3835
3836  // Load input into __m128i & pack to 16 bits
3837  for (i = 0; i < 16; i++) {
3838    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3839    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3840    inptr[i] = _mm_packs_epi32(temp1, temp2);
3841    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3842    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3843    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3844  }
3845
3846  // Find the min & max for the row transform
3847  max_input = _mm_max_epi16(inptr[0], inptr[1]);
3848  min_input = _mm_min_epi16(inptr[0], inptr[1]);
3849  for (i = 2; i < 32; i++) {
3850    max_input = _mm_max_epi16(max_input, inptr[i]);
3851    min_input = _mm_min_epi16(min_input, inptr[i]);
3852  }
3853  max_input = _mm_cmpgt_epi16(max_input, max);
3854  min_input = _mm_cmplt_epi16(min_input, min);
3855  temp1 = _mm_or_si128(max_input, min_input);
3856  test = _mm_movemask_epi8(temp1);
3857
3858  if (!test) {
3859    // Do the row transform
3860    idct16_sse2(inptr, inptr + 16);
3861
3862    // Find the min & max for the column transform
3863    max_input = _mm_max_epi16(inptr[0], inptr[1]);
3864    min_input = _mm_min_epi16(inptr[0], inptr[1]);
3865    for (i = 2; i < 32; i++) {
3866      max_input = _mm_max_epi16(max_input, inptr[i]);
3867      min_input = _mm_min_epi16(min_input, inptr[i]);
3868    }
3869    max_input = _mm_cmpgt_epi16(max_input, max);
3870    min_input = _mm_cmplt_epi16(min_input, min);
3871    temp1 = _mm_or_si128(max_input, min_input);
3872    test = _mm_movemask_epi8(temp1);
3873
3874    if (test) {
3875      array_transpose_16x16(inptr, inptr + 16);
3876      for (i = 0; i < 16; i++) {
3877        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3878        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3879        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3880        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
3881        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
3882        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
3883        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
3884        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
3885        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
3886        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
3887      }
3888    } else {
3889      // Set to use the optimised transform for the column
3890      optimised_cols = 1;
3891    }
3892  } else {
3893    // Run the un-optimised row transform
3894    for (i = 0; i < 16; ++i) {
3895      vpx_highbd_idct16_c(input, outptr, bd);
3896      input += 16;
3897      outptr += 16;
3898    }
3899  }
3900
3901  if (optimised_cols) {
3902    idct16_sse2(inptr, inptr + 16);
3903
3904    // Final round & shift, reconstruction, and store
3905    {
3906      __m128i d[2];
3907      for (i = 0; i < 16; i++) {
3908        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
3909        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
3910        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3911        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
3912        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
3913        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
3914        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
3915        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
3916        // Store
3917        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
3918        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
3919      }
3920    }
3921  } else {
3922    // Run the un-optimised column transform
3923    tran_low_t temp_in[16], temp_out[16];
3924    for (i = 0; i < 16; ++i) {
3925      for (j = 0; j < 16; ++j)
3926        temp_in[j] = out[j * 16 + i];
3927      vpx_highbd_idct16_c(temp_in, temp_out, bd);
3928      for (j = 0; j < 16; ++j) {
3929        dest[j * stride + i] = highbd_clip_pixel_add(
3930            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
3931      }
3932    }
3933  }
3934}
3935
3936void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3937                                      int stride, int bd) {
3938  tran_low_t out[16 * 16] = { 0 };
3939  tran_low_t *outptr = out;
3940  int i, j, test;
3941  __m128i inptr[32];
3942  __m128i min_input, max_input, temp1, temp2, sign_bits;
3943  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3944  const __m128i zero = _mm_setzero_si128();
3945  const __m128i rounding = _mm_set1_epi16(32);
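      // Coefficients outside [-3155, 3155] could overflow the 16-bit
      // intermediates of the SSE2 path, so such blocks take the C fallback.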
3946  const __m128i max = _mm_set1_epi16(3155);
3947  const __m128i min = _mm_set1_epi16(-3155);
3948  int optimised_cols = 0;
3949
3950  // Load input into __m128i & pack to 16 bits
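      // (tran_low_t is 32-bit in high-bitdepth builds, so each row of 16
      // coefficients takes four loads and two packs.)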
3951  for (i = 0; i < 16; i++) {
3952    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3953    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3954    inptr[i] = _mm_packs_epi32(temp1, temp2);
3955    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3956    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3957    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3958  }
3959
3960  // Find the min & max for the row transform
3961  // Since all non-zero DCT coefficients are in the upper-left 4x4 area,
3962  // we only need to consider the first 4 rows here.
3963  max_input = _mm_max_epi16(inptr[0], inptr[1]);
3964  min_input = _mm_min_epi16(inptr[0], inptr[1]);
3965  for (i = 2; i < 4; i++) {
3966    max_input = _mm_max_epi16(max_input, inptr[i]);
3967    min_input = _mm_min_epi16(min_input, inptr[i]);
3968  }
3969  max_input = _mm_cmpgt_epi16(max_input, max);
3970  min_input = _mm_cmplt_epi16(min_input, min);
3971  temp1 = _mm_or_si128(max_input, min_input);
3972  test = _mm_movemask_epi8(temp1);
3973
3974  if (!test) {
3975    // Do the row transform (N.B. This transposes inptr)
3976    idct16_sse2(inptr, inptr + 16);
3977
3978    // Find the min & max for the column transform
3979    // N.B. Only the first 4 columns contain non-zero coeffs
3980    max_input = _mm_max_epi16(inptr[0], inptr[1]);
3981    min_input = _mm_min_epi16(inptr[0], inptr[1]);
3982    for (i = 2; i < 16; i++) {
3983      max_input = _mm_max_epi16(max_input, inptr[i]);
3984      min_input = _mm_min_epi16(min_input, inptr[i]);
3985    }
3986    max_input = _mm_cmpgt_epi16(max_input, max);
3987    min_input = _mm_cmplt_epi16(min_input, min);
3988    temp1 = _mm_or_si128(max_input, min_input);
3989    test = _mm_movemask_epi8(temp1);
3990
3991    if (test) {
3992      // Use the fact that only the first 4 rows contain non-zero coeffs
3993      array_transpose_8x8(inptr, inptr);
3994      array_transpose_8x8(inptr + 8, inptr + 16);
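          // Sign-extend the 16-bit results of the 4 non-zero rows back to
          // 32-bit tran_low_t in 'out'; the remaining rows of 'out' stay zero.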
3995      for (i = 0; i < 4; i++) {
3996        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3997        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3998        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3999        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
4000        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
4001        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
4002        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
4003        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
4004        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
4005        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
4006      }
4007    } else {
4008      // Set to use the optimised transform for the columns
4009      optimised_cols = 1;
4010    }
4011  } else {
4012    // Run the un-optimised row transform
4013    for (i = 0; i < 4; ++i) {
4014      vpx_highbd_idct16_c(input, outptr, bd);
4015      input += 16;
4016      outptr += 16;
4017    }
4018  }
4019
4020  if (optimised_cols) {
4021    idct16_sse2(inptr, inptr + 16);
4022
4023    // Final rounding and shift, then reconstruction and store
4024    {
4025      __m128i d[2];
4026      for (i = 0; i < 16; i++) {
4027        inptr[i] = _mm_add_epi16(inptr[i], rounding);
4028        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
4029        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
4030        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
4031        inptr[i] = _mm_srai_epi16(inptr[i], 6);
4032        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
4033        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
4034        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
4035        // Store
4036        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
4037        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
4038      }
4039    }
4040  } else {
4041    // Run the un-optimised column transform
4042    tran_low_t temp_in[16], temp_out[16];
4043    for (i = 0; i < 16; ++i) {
4044      for (j = 0; j < 16; ++j)
4045        temp_in[j] = out[j * 16 + i];
4046      vpx_highbd_idct16_c(temp_in, temp_out, bd);
4047      for (j = 0; j < 16; ++j) {
4048        dest[j * stride + i] = highbd_clip_pixel_add(
4049            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
4050      }
4051    }
4052  }
4053}
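
    // Illustrative usage sketch (not part of the library): when only the
    // top-left 4x4 of the 16x16 block holds non-zero coefficients, a caller
    // might invoke this variant roughly as below. The buffer names, the
    // stride of 16 and the bit depth of 10 are assumptions for illustration.
    //
    //   DECLARE_ALIGNED(16, tran_low_t, coeff[16 * 16]);
    //   DECLARE_ALIGNED(16, uint16_t, recon[16 * 16]);
    //   vpx_highbd_idct16x16_10_add_sse2(coeff, CONVERT_TO_BYTEPTR(recon),
    //                                    16 /* stride */, 10 /* bd */);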
4054#endif  // CONFIG_VP9_HIGHBITDEPTH
4055