1/*
2 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <emmintrin.h>  // SSE2
13
14#include "./vp9_rtcd.h"
15#include "./vpx_dsp_rtcd.h"
16#include "vpx_dsp/txfm_common.h"
17#include "vpx_dsp/x86/fwd_txfm_sse2.h"
18#include "vpx_dsp/x86/txfm_common_sse2.h"
19#include "vpx_ports/mem.h"
20
// Loads a 4x4 block of 16-bit samples into the low four lanes of in[0..3]
// (consecutive rows are `stride` elements apart) and multiplies every sample
// by 16. Afterwards lane 0 of in[0] is incremented by one unless its scaled
// value is zero.
static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride) {
  const __m128i lane0_one = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  const __m128i lane0_probe = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  __m128i zero_mask;
  int row;

  for (row = 0; row < 4; ++row) {
    // 64-bit load: four samples land in the low half, the high half is zero.
    const __m128i samples =
        _mm_loadl_epi64((const __m128i *)(input + row * stride));
    in[row] = _mm_slli_epi16(samples, 4);
  }

  // zero_mask is -1 in lane 0 when the scaled first sample is zero. The other
  // lanes hold multiples of 16 or zero, so they never match the probe value 1
  // and their mask stays 0. Adding the mask and then lane0_one therefore
  // leaves a zero lane 0 at zero and bumps a non-zero lane 0 by one.
  zero_mask = _mm_cmpeq_epi16(in[0], lane0_probe);
  in[0] = _mm_add_epi16(_mm_add_epi16(in[0], zero_mask), lane0_one);
}
41
42static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
43  const __m128i kOne = _mm_set1_epi16(1);
44  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
45  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
46  __m128i out01 = _mm_add_epi16(in01, kOne);
47  __m128i out23 = _mm_add_epi16(in23, kOne);
48  out01 = _mm_srai_epi16(out01, 2);
49  out23 = _mm_srai_epi16(out23, 2);
50  store_output(&out01, (output + 0 * 8));
51  store_output(&out23, (output + 1 * 8));
52}
53
// Transposes a 4x4 block of 16-bit values held in the packed two-register
// layout
//   res[0] = 00 01 02 03 20 21 22 23
//   res[1] = 10 11 12 13 30 31 32 33
// On return the low four lanes of res[k] hold column k (0k 1k 2k 3k).
// res[2] and res[3] are outputs only; their incoming contents are ignored.
static INLINE void transpose_4x4(__m128i *res) {
  // 00 10 01 11 02 12 03 13
  const __m128i pairs_lo = _mm_unpacklo_epi16(res[0], res[1]);
  // 20 30 21 31 22 32 23 33
  const __m128i pairs_hi = _mm_unpackhi_epi16(res[0], res[1]);
  // 00 10 20 30 01 11 21 31
  const __m128i cols01 = _mm_unpacklo_epi32(pairs_lo, pairs_hi);
  // 02 12 22 32 03 13 23 33
  const __m128i cols23 = _mm_unpackhi_epi32(pairs_lo, pairs_hi);

  // Callers only consume the low four lanes of each register, so the odd
  // columns are simply the upper halves copied down.
  res[0] = cols01;
  res[1] = _mm_unpackhi_epi64(cols01, cols01);
  res[2] = cols23;
  res[3] = _mm_unpackhi_epi64(cols23, cols23);
}
72
73static void fdct4_sse2(__m128i *in) {
74  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
75  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
76  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
77  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
78  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
79
80  __m128i u[4], v[4];
81  u[0]=_mm_unpacklo_epi16(in[0], in[1]);
82  u[1]=_mm_unpacklo_epi16(in[3], in[2]);
83
84  v[0] = _mm_add_epi16(u[0], u[1]);
85  v[1] = _mm_sub_epi16(u[0], u[1]);
86
87  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
88  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
89  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
90  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
91
92  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
93  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
94  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
95  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
96  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
97  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
98  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
99  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
100
101  in[0] = _mm_packs_epi32(u[0], u[1]);
102  in[1] = _mm_packs_epi32(u[2], u[3]);
103  transpose_4x4(in);
104}
105
106static void fadst4_sse2(__m128i *in) {
107  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
108  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
109  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
110  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
111  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
112  const __m128i kZero = _mm_set1_epi16(0);
113  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
114  __m128i u[8], v[8];
115  __m128i in7 = _mm_add_epi16(in[0], in[1]);
116
117  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
118  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
119  u[2] = _mm_unpacklo_epi16(in7, kZero);
120  u[3] = _mm_unpacklo_epi16(in[2], kZero);
121  u[4] = _mm_unpacklo_epi16(in[3], kZero);
122
123  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
124  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
125  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
126  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
127  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
128  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
129  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
130
131  u[0] = _mm_add_epi32(v[0], v[1]);
132  u[1] = _mm_sub_epi32(v[2], v[6]);
133  u[2] = _mm_add_epi32(v[3], v[4]);
134  u[3] = _mm_sub_epi32(u[2], u[0]);
135  u[4] = _mm_slli_epi32(v[5], 2);
136  u[5] = _mm_sub_epi32(u[4], v[5]);
137  u[6] = _mm_add_epi32(u[3], u[5]);
138
139  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
140  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
141  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
142  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
143
144  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
145  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
146  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
147  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
148
149  in[0] = _mm_packs_epi32(u[0], u[2]);
150  in[1] = _mm_packs_epi32(u[1], u[3]);
151  transpose_4x4(in);
152}
153
154void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output,
155                     int stride, int tx_type) {
156  __m128i in[4];
157
158  switch (tx_type) {
159    case DCT_DCT:
160      vpx_fdct4x4_sse2(input, output, stride);
161      break;
162    case ADST_DCT:
163      load_buffer_4x4(input, in, stride);
164      fadst4_sse2(in);
165      fdct4_sse2(in);
166      write_buffer_4x4(output, in);
167      break;
168    case DCT_ADST:
169      load_buffer_4x4(input, in, stride);
170      fdct4_sse2(in);
171      fadst4_sse2(in);
172      write_buffer_4x4(output, in);
173      break;
174    case ADST_ADST:
175      load_buffer_4x4(input, in, stride);
176      fadst4_sse2(in);
177      fadst4_sse2(in);
178      write_buffer_4x4(output, in);
179      break;
180   default:
181     assert(0);
182     break;
183  }
184}
185
186void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
187                            int16_t* coeff_ptr, intptr_t n_coeffs,
188                            int skip_block, const int16_t* zbin_ptr,
189                            const int16_t* round_ptr, const int16_t* quant_ptr,
190                            const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
191                            int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
192                            uint16_t* eob_ptr,
193                            const int16_t* scan_ptr,
194                            const int16_t* iscan_ptr) {
195  __m128i zero;
196  int pass;
197  // Constants
198  //    When we use them, in one case, they are all the same. In all others
199  //    it's a pair of them that we need to repeat four times. This is done
200  //    by constructing the 32 bit constant corresponding to that pair.
201  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
202  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
203  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
204  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
205  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
206  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
207  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
208  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
209  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
210  // Load input
211  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
212  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
213  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
214  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
215  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
216  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
217  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
218  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
219  __m128i *in[8];
220  int index = 0;
221
222  (void)scan_ptr;
223  (void)zbin_ptr;
224  (void)quant_shift_ptr;
225  (void)coeff_ptr;
226
227  // Pre-condition input (shift by two)
228  in0 = _mm_slli_epi16(in0, 2);
229  in1 = _mm_slli_epi16(in1, 2);
230  in2 = _mm_slli_epi16(in2, 2);
231  in3 = _mm_slli_epi16(in3, 2);
232  in4 = _mm_slli_epi16(in4, 2);
233  in5 = _mm_slli_epi16(in5, 2);
234  in6 = _mm_slli_epi16(in6, 2);
235  in7 = _mm_slli_epi16(in7, 2);
236
237  in[0] = &in0;
238  in[1] = &in1;
239  in[2] = &in2;
240  in[3] = &in3;
241  in[4] = &in4;
242  in[5] = &in5;
243  in[6] = &in6;
244  in[7] = &in7;
245
246  // We do two passes, first the columns, then the rows. The results of the
247  // first pass are transposed so that the same column code can be reused. The
248  // results of the second pass are also transposed so that the rows (processed
249  // as columns) are put back in row positions.
250  for (pass = 0; pass < 2; pass++) {
251    // To store results of each pass before the transpose.
252    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
253    // Add/subtract
254    const __m128i q0 = _mm_add_epi16(in0, in7);
255    const __m128i q1 = _mm_add_epi16(in1, in6);
256    const __m128i q2 = _mm_add_epi16(in2, in5);
257    const __m128i q3 = _mm_add_epi16(in3, in4);
258    const __m128i q4 = _mm_sub_epi16(in3, in4);
259    const __m128i q5 = _mm_sub_epi16(in2, in5);
260    const __m128i q6 = _mm_sub_epi16(in1, in6);
261    const __m128i q7 = _mm_sub_epi16(in0, in7);
262    // Work on first four results
263    {
264      // Add/subtract
265      const __m128i r0 = _mm_add_epi16(q0, q3);
266      const __m128i r1 = _mm_add_epi16(q1, q2);
267      const __m128i r2 = _mm_sub_epi16(q1, q2);
268      const __m128i r3 = _mm_sub_epi16(q0, q3);
269      // Interleave to do the multiply by constants which gets us into 32bits
270      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
271      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
272      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
273      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
274      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
275      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
276      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
277      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
278      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
279      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
280      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
281      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
282      // dct_const_round_shift
283      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
284      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
285      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
286      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
287      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
288      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
289      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
290      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
291      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
292      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
293      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
294      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
295      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
296      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
297      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
298      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
299      // Combine
300      res0 = _mm_packs_epi32(w0, w1);
301      res4 = _mm_packs_epi32(w2, w3);
302      res2 = _mm_packs_epi32(w4, w5);
303      res6 = _mm_packs_epi32(w6, w7);
304    }
305    // Work on next four results
306    {
307      // Interleave to do the multiply by constants which gets us into 32bits
308      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
309      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
310      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
311      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
312      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
313      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
314      // dct_const_round_shift
315      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
316      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
317      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
318      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
319      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
320      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
321      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
322      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
323      // Combine
324      const __m128i r0 = _mm_packs_epi32(s0, s1);
325      const __m128i r1 = _mm_packs_epi32(s2, s3);
326      // Add/subtract
327      const __m128i x0 = _mm_add_epi16(q4, r0);
328      const __m128i x1 = _mm_sub_epi16(q4, r0);
329      const __m128i x2 = _mm_sub_epi16(q7, r1);
330      const __m128i x3 = _mm_add_epi16(q7, r1);
331      // Interleave to do the multiply by constants which gets us into 32bits
332      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
333      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
334      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
335      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
336      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
337      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
338      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
339      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
340      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
341      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
342      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
343      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
344      // dct_const_round_shift
345      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
346      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
347      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
348      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
349      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
350      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
351      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
352      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
353      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
354      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
355      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
356      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
357      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
358      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
359      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
360      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
361      // Combine
362      res1 = _mm_packs_epi32(w0, w1);
363      res7 = _mm_packs_epi32(w2, w3);
364      res5 = _mm_packs_epi32(w4, w5);
365      res3 = _mm_packs_epi32(w6, w7);
366    }
367    // Transpose the 8x8.
368    {
369      // 00 01 02 03 04 05 06 07
370      // 10 11 12 13 14 15 16 17
371      // 20 21 22 23 24 25 26 27
372      // 30 31 32 33 34 35 36 37
373      // 40 41 42 43 44 45 46 47
374      // 50 51 52 53 54 55 56 57
375      // 60 61 62 63 64 65 66 67
376      // 70 71 72 73 74 75 76 77
377      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
378      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
379      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
380      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
381      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
382      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
383      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
384      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
385      // 00 10 01 11 02 12 03 13
386      // 20 30 21 31 22 32 23 33
387      // 04 14 05 15 06 16 07 17
388      // 24 34 25 35 26 36 27 37
389      // 40 50 41 51 42 52 43 53
390      // 60 70 61 71 62 72 63 73
391      // 54 54 55 55 56 56 57 57
392      // 64 74 65 75 66 76 67 77
393      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
394      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
395      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
396      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
397      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
398      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
399      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
400      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
401      // 00 10 20 30 01 11 21 31
402      // 40 50 60 70 41 51 61 71
403      // 02 12 22 32 03 13 23 33
404      // 42 52 62 72 43 53 63 73
405      // 04 14 24 34 05 15 21 36
406      // 44 54 64 74 45 55 61 76
407      // 06 16 26 36 07 17 27 37
408      // 46 56 66 76 47 57 67 77
409      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
410      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
411      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
412      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
413      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
414      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
415      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
416      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
417      // 00 10 20 30 40 50 60 70
418      // 01 11 21 31 41 51 61 71
419      // 02 12 22 32 42 52 62 72
420      // 03 13 23 33 43 53 63 73
421      // 04 14 24 34 44 54 64 74
422      // 05 15 25 35 45 55 65 75
423      // 06 16 26 36 46 56 66 76
424      // 07 17 27 37 47 57 67 77
425    }
426  }
427  // Post-condition output and store it
428  {
429    // Post-condition (division by two)
430    //    division of two 16 bits signed numbers using shifts
431    //    n / 2 = (n - (n >> 15)) >> 1
432    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
433    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
434    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
435    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
436    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
437    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
438    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
439    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
440    in0 = _mm_sub_epi16(in0, sign_in0);
441    in1 = _mm_sub_epi16(in1, sign_in1);
442    in2 = _mm_sub_epi16(in2, sign_in2);
443    in3 = _mm_sub_epi16(in3, sign_in3);
444    in4 = _mm_sub_epi16(in4, sign_in4);
445    in5 = _mm_sub_epi16(in5, sign_in5);
446    in6 = _mm_sub_epi16(in6, sign_in6);
447    in7 = _mm_sub_epi16(in7, sign_in7);
448    in0 = _mm_srai_epi16(in0, 1);
449    in1 = _mm_srai_epi16(in1, 1);
450    in2 = _mm_srai_epi16(in2, 1);
451    in3 = _mm_srai_epi16(in3, 1);
452    in4 = _mm_srai_epi16(in4, 1);
453    in5 = _mm_srai_epi16(in5, 1);
454    in6 = _mm_srai_epi16(in6, 1);
455    in7 = _mm_srai_epi16(in7, 1);
456  }
457
458  iscan_ptr += n_coeffs;
459  qcoeff_ptr += n_coeffs;
460  dqcoeff_ptr += n_coeffs;
461  n_coeffs = -n_coeffs;
462  zero = _mm_setzero_si128();
463
464  if (!skip_block) {
465    __m128i eob;
466    __m128i round, quant, dequant;
467    {
468      __m128i coeff0, coeff1;
469
470      // Setup global values
471      {
472        round = _mm_load_si128((const __m128i*)round_ptr);
473        quant = _mm_load_si128((const __m128i*)quant_ptr);
474        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
475      }
476
477      {
478        __m128i coeff0_sign, coeff1_sign;
479        __m128i qcoeff0, qcoeff1;
480        __m128i qtmp0, qtmp1;
481        // Do DC and first 15 AC
482        coeff0 = *in[0];
483        coeff1 = *in[1];
484
485        // Poor man's sign extract
486        coeff0_sign = _mm_srai_epi16(coeff0, 15);
487        coeff1_sign = _mm_srai_epi16(coeff1, 15);
488        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
489        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
490        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
491        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
492
493        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
494        round = _mm_unpackhi_epi64(round, round);
495        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
496        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
497        quant = _mm_unpackhi_epi64(quant, quant);
498        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
499
500        // Reinsert signs
501        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
502        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
503        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
504        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
505
506        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
507        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
508
509        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
510        dequant = _mm_unpackhi_epi64(dequant, dequant);
511        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
512
513        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
514        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
515      }
516
517      {
518        // Scan for eob
519        __m128i zero_coeff0, zero_coeff1;
520        __m128i nzero_coeff0, nzero_coeff1;
521        __m128i iscan0, iscan1;
522        __m128i eob1;
523        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
524        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
525        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
526        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
527        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
528        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
529        // Add one to convert from indices to counts
530        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
531        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
532        eob = _mm_and_si128(iscan0, nzero_coeff0);
533        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
534        eob = _mm_max_epi16(eob, eob1);
535      }
536      n_coeffs += 8 * 2;
537    }
538
539    // AC only loop
540    index = 2;
541    while (n_coeffs < 0) {
542      __m128i coeff0, coeff1;
543      {
544        __m128i coeff0_sign, coeff1_sign;
545        __m128i qcoeff0, qcoeff1;
546        __m128i qtmp0, qtmp1;
547
548        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
549        coeff0 = *in[index];
550        coeff1 = *in[index + 1];
551
552        // Poor man's sign extract
553        coeff0_sign = _mm_srai_epi16(coeff0, 15);
554        coeff1_sign = _mm_srai_epi16(coeff1, 15);
555        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
556        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
557        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
558        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
559
560        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
561        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
562        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
563        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
564
565        // Reinsert signs
566        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
567        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
568        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
569        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
570
571        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
572        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
573
574        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
575        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
576
577        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
578        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
579      }
580
581      {
582        // Scan for eob
583        __m128i zero_coeff0, zero_coeff1;
584        __m128i nzero_coeff0, nzero_coeff1;
585        __m128i iscan0, iscan1;
586        __m128i eob0, eob1;
587        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
588        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
589        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
590        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
591        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
592        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
593        // Add one to convert from indices to counts
594        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
595        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
596        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
597        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
598        eob0 = _mm_max_epi16(eob0, eob1);
599        eob = _mm_max_epi16(eob, eob0);
600      }
601      n_coeffs += 8 * 2;
602      index += 2;
603    }
604
605    // Accumulate EOB
606    {
607      __m128i eob_shuffled;
608      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
609      eob = _mm_max_epi16(eob, eob_shuffled);
610      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
611      eob = _mm_max_epi16(eob, eob_shuffled);
612      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
613      eob = _mm_max_epi16(eob, eob_shuffled);
614      *eob_ptr = _mm_extract_epi16(eob, 1);
615    }
616  } else {
617    do {
618      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
619      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
620      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
621      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
622      n_coeffs += 8 * 2;
623    } while (n_coeffs < 0);
624    *eob_ptr = 0;
625  }
626}
627
// Loads an 8x8 block of 16-bit samples into in[0..7], one row per register,
// and multiplies every sample by four. Rows are `stride` elements apart and
// are read with aligned 128-bit loads.
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride) {
  int row;

  for (row = 0; row < 8; ++row) {
    const __m128i samples =
        _mm_load_si128((const __m128i *)(input + row * stride));
    in[row] = _mm_slli_epi16(samples, 2);
  }
}
649
// Rounding right shift of all eight rows in res[0..7].
//
// Every lane first has its sign bit subtracted (i.e. negative lanes gain
// one). With bit == 1 the lanes are then shifted right by one; with any
// other value they are shifted right by two, and only for bit == 2 is an
// additional rounding constant of one added beforehand.
static INLINE void right_shift_8x8(__m128i *res, const int bit) {
  const __m128i rounding_one = _mm_set1_epi16(1);
  int i;

  for (i = 0; i < 8; ++i) {
    // The sign mask is taken from the untouched input: -1 in negative lanes.
    const __m128i sign = _mm_srai_epi16(res[i], 15);
    __m128i value = res[i];

    if (bit == 2) {
      value = _mm_add_epi16(value, rounding_one);
    }
    value = _mm_sub_epi16(value, sign);

    if (bit == 1) {
      res[i] = _mm_srai_epi16(value, 1);
    } else {
      res[i] = _mm_srai_epi16(value, 2);
    }
  }
}
702
703// write 8x8 array
704static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
705                                    int stride) {
706  store_output(&res[0], (output + 0 * stride));
707  store_output(&res[1], (output + 1 * stride));
708  store_output(&res[2], (output + 2 * stride));
709  store_output(&res[3], (output + 3 * stride));
710  store_output(&res[4], (output + 4 * stride));
711  store_output(&res[5], (output + 5 * stride));
712  store_output(&res[6], (output + 6 * stride));
713  store_output(&res[7], (output + 7 * stride));
714}
715
// Transposes the 8x8 matrix of 16-bit values stored one row per register in
// in[0..7] and writes the result to res[0..7]. All inputs are consumed
// before the first output is written, so `res` may be the same array as
// `in` (in-place transpose).
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  // Step 1: interleave 16-bit lanes of neighbouring rows, e.g.
  //   r01_lo = 00 10 01 11 02 12 03 13
  //   r01_hi = 04 14 05 15 06 16 07 17
  const __m128i r01_lo = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i r01_hi = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i r23_lo = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i r23_hi = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i r45_lo = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i r45_hi = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i r67_lo = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i r67_hi = _mm_unpackhi_epi16(in[6], in[7]);

  // Step 2: interleave 32-bit pairs. "top" covers rows 0-3, "bot" rows 4-7;
  // the suffix names the two columns held in the register, e.g.
  //   top_c01 = 00 10 20 30 01 11 21 31
  //   bot_c01 = 40 50 60 70 41 51 61 71
  const __m128i top_c01 = _mm_unpacklo_epi32(r01_lo, r23_lo);
  const __m128i top_c23 = _mm_unpackhi_epi32(r01_lo, r23_lo);
  const __m128i top_c45 = _mm_unpacklo_epi32(r01_hi, r23_hi);
  const __m128i top_c67 = _mm_unpackhi_epi32(r01_hi, r23_hi);
  const __m128i bot_c01 = _mm_unpacklo_epi32(r45_lo, r67_lo);
  const __m128i bot_c23 = _mm_unpackhi_epi32(r45_lo, r67_lo);
  const __m128i bot_c45 = _mm_unpacklo_epi32(r45_hi, r67_hi);
  const __m128i bot_c67 = _mm_unpackhi_epi32(r45_hi, r67_hi);

  // Step 3: join the 64-bit halves so that res[k] is column k:
  //   res[k] = 0k 1k 2k 3k 4k 5k 6k 7k
  res[0] = _mm_unpacklo_epi64(top_c01, bot_c01);
  res[1] = _mm_unpackhi_epi64(top_c01, bot_c01);
  res[2] = _mm_unpacklo_epi64(top_c23, bot_c23);
  res[3] = _mm_unpackhi_epi64(top_c23, bot_c23);
  res[4] = _mm_unpacklo_epi64(top_c45, bot_c45);
  res[5] = _mm_unpackhi_epi64(top_c45, bot_c45);
  res[6] = _mm_unpacklo_epi64(top_c67, bot_c67);
  res[7] = _mm_unpackhi_epi64(top_c67, bot_c67);
}
767
768static void fdct8_sse2(__m128i *in) {
769  // constants
770  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
771  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
772  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
773  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
774  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
775  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
776  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
777  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
778  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
779  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
780  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
781  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
782
783  // stage 1
784  s0 = _mm_add_epi16(in[0], in[7]);
785  s1 = _mm_add_epi16(in[1], in[6]);
786  s2 = _mm_add_epi16(in[2], in[5]);
787  s3 = _mm_add_epi16(in[3], in[4]);
788  s4 = _mm_sub_epi16(in[3], in[4]);
789  s5 = _mm_sub_epi16(in[2], in[5]);
790  s6 = _mm_sub_epi16(in[1], in[6]);
791  s7 = _mm_sub_epi16(in[0], in[7]);
792
793  u0 = _mm_add_epi16(s0, s3);
794  u1 = _mm_add_epi16(s1, s2);
795  u2 = _mm_sub_epi16(s1, s2);
796  u3 = _mm_sub_epi16(s0, s3);
797  // interleave and perform butterfly multiplication/addition
798  v0 = _mm_unpacklo_epi16(u0, u1);
799  v1 = _mm_unpackhi_epi16(u0, u1);
800  v2 = _mm_unpacklo_epi16(u2, u3);
801  v3 = _mm_unpackhi_epi16(u2, u3);
802
803  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
804  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
805  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
806  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
807  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
808  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
809  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
810  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
811
812  // shift and rounding
813  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
814  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
815  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
816  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
817  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
818  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
819  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
820  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
821
822  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
823  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
824  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
825  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
826  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
827  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
828  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
829  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
830
831  in[0] = _mm_packs_epi32(u0, u1);
832  in[2] = _mm_packs_epi32(u4, u5);
833  in[4] = _mm_packs_epi32(u2, u3);
834  in[6] = _mm_packs_epi32(u6, u7);
835
836  // stage 2
837  // interleave and perform butterfly multiplication/addition
838  u0 = _mm_unpacklo_epi16(s6, s5);
839  u1 = _mm_unpackhi_epi16(s6, s5);
840  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
841  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
842  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
843  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
844
845  // shift and rounding
846  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
847  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
848  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
849  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
850
851  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
852  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
853  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
854  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
855
856  u0 = _mm_packs_epi32(v0, v1);
857  u1 = _mm_packs_epi32(v2, v3);
858
859  // stage 3
860  s0 = _mm_add_epi16(s4, u0);
861  s1 = _mm_sub_epi16(s4, u0);
862  s2 = _mm_sub_epi16(s7, u1);
863  s3 = _mm_add_epi16(s7, u1);
864
865  // stage 4
866  u0 = _mm_unpacklo_epi16(s0, s3);
867  u1 = _mm_unpackhi_epi16(s0, s3);
868  u2 = _mm_unpacklo_epi16(s1, s2);
869  u3 = _mm_unpackhi_epi16(s1, s2);
870
871  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
872  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
873  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
874  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
875  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
876  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
877  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
878  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
879
880  // shift and rounding
881  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
882  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
883  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
884  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
885  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
886  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
887  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
888  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
889
890  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
891  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
892  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
893  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
894  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
895  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
896  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
897  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
898
899  in[1] = _mm_packs_epi32(v0, v1);
900  in[3] = _mm_packs_epi32(v4, v5);
901  in[5] = _mm_packs_epi32(v2, v3);
902  in[7] = _mm_packs_epi32(v6, v7);
903
904  // transpose
905  array_transpose_8x8(in, in);
906}
907
908static void fadst8_sse2(__m128i *in) {
909  // Constants
910  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
911  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
912  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
913  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
914  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
915  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
916  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
917  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
918  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
919  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
920  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
921  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
922  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
923  const __m128i k__const_0 = _mm_set1_epi16(0);
924  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
925
926  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
927  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
928  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
929  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
930  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
931
932  // properly aligned for butterfly input
933  in0  = in[7];
934  in1  = in[0];
935  in2  = in[5];
936  in3  = in[2];
937  in4  = in[3];
938  in5  = in[4];
939  in6  = in[1];
940  in7  = in[6];
941
942  // column transformation
943  // stage 1
944  // interleave and multiply/add into 32-bit integer
945  s0 = _mm_unpacklo_epi16(in0, in1);
946  s1 = _mm_unpackhi_epi16(in0, in1);
947  s2 = _mm_unpacklo_epi16(in2, in3);
948  s3 = _mm_unpackhi_epi16(in2, in3);
949  s4 = _mm_unpacklo_epi16(in4, in5);
950  s5 = _mm_unpackhi_epi16(in4, in5);
951  s6 = _mm_unpacklo_epi16(in6, in7);
952  s7 = _mm_unpackhi_epi16(in6, in7);
953
954  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
955  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
956  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
957  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
958  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
959  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
960  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
961  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
962  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
963  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
964  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
965  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
966  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
967  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
968  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
969  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
970
971  // addition
972  w0 = _mm_add_epi32(u0, u8);
973  w1 = _mm_add_epi32(u1, u9);
974  w2 = _mm_add_epi32(u2, u10);
975  w3 = _mm_add_epi32(u3, u11);
976  w4 = _mm_add_epi32(u4, u12);
977  w5 = _mm_add_epi32(u5, u13);
978  w6 = _mm_add_epi32(u6, u14);
979  w7 = _mm_add_epi32(u7, u15);
980  w8 = _mm_sub_epi32(u0, u8);
981  w9 = _mm_sub_epi32(u1, u9);
982  w10 = _mm_sub_epi32(u2, u10);
983  w11 = _mm_sub_epi32(u3, u11);
984  w12 = _mm_sub_epi32(u4, u12);
985  w13 = _mm_sub_epi32(u5, u13);
986  w14 = _mm_sub_epi32(u6, u14);
987  w15 = _mm_sub_epi32(u7, u15);
988
989  // shift and rounding
990  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
991  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
992  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
993  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
994  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
995  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
996  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
997  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
998  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
999  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
1000  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
1001  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
1002  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
1003  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
1004  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
1005  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
1006
1007  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1008  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1009  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1010  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1011  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
1012  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
1013  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
1014  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
1015  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
1016  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
1017  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
1018  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
1019  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
1020  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
1021  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
1022  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
1023
1024  // back to 16-bit and pack 8 integers into __m128i
1025  in[0] = _mm_packs_epi32(u0, u1);
1026  in[1] = _mm_packs_epi32(u2, u3);
1027  in[2] = _mm_packs_epi32(u4, u5);
1028  in[3] = _mm_packs_epi32(u6, u7);
1029  in[4] = _mm_packs_epi32(u8, u9);
1030  in[5] = _mm_packs_epi32(u10, u11);
1031  in[6] = _mm_packs_epi32(u12, u13);
1032  in[7] = _mm_packs_epi32(u14, u15);
1033
1034  // stage 2
1035  s0 = _mm_add_epi16(in[0], in[2]);
1036  s1 = _mm_add_epi16(in[1], in[3]);
1037  s2 = _mm_sub_epi16(in[0], in[2]);
1038  s3 = _mm_sub_epi16(in[1], in[3]);
1039  u0 = _mm_unpacklo_epi16(in[4], in[5]);
1040  u1 = _mm_unpackhi_epi16(in[4], in[5]);
1041  u2 = _mm_unpacklo_epi16(in[6], in[7]);
1042  u3 = _mm_unpackhi_epi16(in[6], in[7]);
1043
1044  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
1045  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
1046  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
1047  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
1048  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
1049  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
1050  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
1051  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
1052
1053  w0 = _mm_add_epi32(v0, v4);
1054  w1 = _mm_add_epi32(v1, v5);
1055  w2 = _mm_add_epi32(v2, v6);
1056  w3 = _mm_add_epi32(v3, v7);
1057  w4 = _mm_sub_epi32(v0, v4);
1058  w5 = _mm_sub_epi32(v1, v5);
1059  w6 = _mm_sub_epi32(v2, v6);
1060  w7 = _mm_sub_epi32(v3, v7);
1061
1062  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
1063  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
1064  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
1065  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
1066  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
1067  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
1068  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
1069  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
1070
1071  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1072  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1073  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1074  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1075  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
1076  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
1077  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
1078  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
1079
1080  // back to 16-bit intergers
1081  s4 = _mm_packs_epi32(u0, u1);
1082  s5 = _mm_packs_epi32(u2, u3);
1083  s6 = _mm_packs_epi32(u4, u5);
1084  s7 = _mm_packs_epi32(u6, u7);
1085
1086  // stage 3
1087  u0 = _mm_unpacklo_epi16(s2, s3);
1088  u1 = _mm_unpackhi_epi16(s2, s3);
1089  u2 = _mm_unpacklo_epi16(s6, s7);
1090  u3 = _mm_unpackhi_epi16(s6, s7);
1091
1092  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
1093  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
1094  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
1095  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
1096  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
1097  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
1098  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
1099  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
1100
1101  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
1102  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
1103  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
1104  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
1105  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
1106  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
1107  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
1108  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
1109
1110  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
1111  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
1112  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
1113  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
1114  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
1115  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
1116  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
1117  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
1118
1119  s2 = _mm_packs_epi32(v0, v1);
1120  s3 = _mm_packs_epi32(v2, v3);
1121  s6 = _mm_packs_epi32(v4, v5);
1122  s7 = _mm_packs_epi32(v6, v7);
1123
1124  // FIXME(jingning): do subtract using bit inversion?
1125  in[0] = s0;
1126  in[1] = _mm_sub_epi16(k__const_0, s4);
1127  in[2] = s6;
1128  in[3] = _mm_sub_epi16(k__const_0, s2);
1129  in[4] = s3;
1130  in[5] = _mm_sub_epi16(k__const_0, s7);
1131  in[6] = s5;
1132  in[7] = _mm_sub_epi16(k__const_0, s1);
1133
1134  // transpose
1135  array_transpose_8x8(in, in);
1136}
1137
1138void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output,
1139                     int stride, int tx_type) {
1140  __m128i in[8];
1141
1142  switch (tx_type) {
1143    case DCT_DCT:
1144      vpx_fdct8x8_sse2(input, output, stride);
1145      break;
1146    case ADST_DCT:
1147      load_buffer_8x8(input, in, stride);
1148      fadst8_sse2(in);
1149      fdct8_sse2(in);
1150      right_shift_8x8(in, 1);
1151      write_buffer_8x8(output, in, 8);
1152      break;
1153    case DCT_ADST:
1154      load_buffer_8x8(input, in, stride);
1155      fdct8_sse2(in);
1156      fadst8_sse2(in);
1157      right_shift_8x8(in, 1);
1158      write_buffer_8x8(output, in, 8);
1159      break;
1160    case ADST_ADST:
1161      load_buffer_8x8(input, in, stride);
1162      fadst8_sse2(in);
1163      fadst8_sse2(in);
1164      right_shift_8x8(in, 1);
1165      write_buffer_8x8(output, in, 8);
1166      break;
1167    default:
1168      assert(0);
1169      break;
1170  }
1171}
1172
1173static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
1174                                     __m128i *in1, int stride) {
1175  // load first 8 columns
1176  load_buffer_8x8(input, in0, stride);
1177  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
1178
1179  input += 8;
1180  // load second 8 columns
1181  load_buffer_8x8(input, in1, stride);
1182  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
1183}
1184
1185static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
1186                                      __m128i *in1, int stride) {
1187  // write first 8 columns
1188  write_buffer_8x8(output, in0, stride);
1189  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
1190  // write second 8 columns
1191  output += 8;
1192  write_buffer_8x8(output, in1, stride);
1193  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
1194}
1195
1196static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
1197  __m128i tbuf[8];
1198  array_transpose_8x8(res0, res0);
1199  array_transpose_8x8(res1, tbuf);
1200  array_transpose_8x8(res0 + 8, res1);
1201  array_transpose_8x8(res1 + 8, res1 + 8);
1202
1203  res0[8] = tbuf[0];
1204  res0[9] = tbuf[1];
1205  res0[10] = tbuf[2];
1206  res0[11] = tbuf[3];
1207  res0[12] = tbuf[4];
1208  res0[13] = tbuf[5];
1209  res0[14] = tbuf[6];
1210  res0[15] = tbuf[7];
1211}
1212
1213static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
1214  // perform rounding operations
1215  right_shift_8x8(res0, 2);
1216  right_shift_8x8(res0 + 8, 2);
1217  right_shift_8x8(res1, 2);
1218  right_shift_8x8(res1 + 8, 2);
1219}
1220
1221static void fdct16_8col(__m128i *in) {
1222  // perform 16x16 1-D DCT for 8 columns
1223  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
1224  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1225  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1226  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1227  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1228  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
1229  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1230  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
1231  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
1232  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
1233  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
1234  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
1235  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
1236  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
1237  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
1238  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
1239  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
1240  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
1241  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
1242  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1243
1244  // stage 1
1245  i[0] = _mm_add_epi16(in[0], in[15]);
1246  i[1] = _mm_add_epi16(in[1], in[14]);
1247  i[2] = _mm_add_epi16(in[2], in[13]);
1248  i[3] = _mm_add_epi16(in[3], in[12]);
1249  i[4] = _mm_add_epi16(in[4], in[11]);
1250  i[5] = _mm_add_epi16(in[5], in[10]);
1251  i[6] = _mm_add_epi16(in[6], in[9]);
1252  i[7] = _mm_add_epi16(in[7], in[8]);
1253
1254  s[0] = _mm_sub_epi16(in[7], in[8]);
1255  s[1] = _mm_sub_epi16(in[6], in[9]);
1256  s[2] = _mm_sub_epi16(in[5], in[10]);
1257  s[3] = _mm_sub_epi16(in[4], in[11]);
1258  s[4] = _mm_sub_epi16(in[3], in[12]);
1259  s[5] = _mm_sub_epi16(in[2], in[13]);
1260  s[6] = _mm_sub_epi16(in[1], in[14]);
1261  s[7] = _mm_sub_epi16(in[0], in[15]);
1262
1263  p[0] = _mm_add_epi16(i[0], i[7]);
1264  p[1] = _mm_add_epi16(i[1], i[6]);
1265  p[2] = _mm_add_epi16(i[2], i[5]);
1266  p[3] = _mm_add_epi16(i[3], i[4]);
1267  p[4] = _mm_sub_epi16(i[3], i[4]);
1268  p[5] = _mm_sub_epi16(i[2], i[5]);
1269  p[6] = _mm_sub_epi16(i[1], i[6]);
1270  p[7] = _mm_sub_epi16(i[0], i[7]);
1271
1272  u[0] = _mm_add_epi16(p[0], p[3]);
1273  u[1] = _mm_add_epi16(p[1], p[2]);
1274  u[2] = _mm_sub_epi16(p[1], p[2]);
1275  u[3] = _mm_sub_epi16(p[0], p[3]);
1276
1277  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
1278  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
1279  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
1280  v[3] = _mm_unpackhi_epi16(u[2], u[3]);
1281
1282  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
1283  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
1284  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
1285  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
1286  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
1287  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
1288  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
1289  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
1290
1291  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1292  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1293  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1294  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1295  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1296  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1297  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1298  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1299
1300  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1301  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1302  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1303  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1304  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1305  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1306  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1307  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1308
1309  in[0] = _mm_packs_epi32(u[0], u[1]);
1310  in[4] = _mm_packs_epi32(u[4], u[5]);
1311  in[8] = _mm_packs_epi32(u[2], u[3]);
1312  in[12] = _mm_packs_epi32(u[6], u[7]);
1313
1314  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
1315  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
1316  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
1317  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
1318  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1319  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1320
1321  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1322  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1323  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1324  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1325
1326  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1327  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1328  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1329  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1330
1331  u[0] = _mm_packs_epi32(v[0], v[1]);
1332  u[1] = _mm_packs_epi32(v[2], v[3]);
1333
1334  t[0] = _mm_add_epi16(p[4], u[0]);
1335  t[1] = _mm_sub_epi16(p[4], u[0]);
1336  t[2] = _mm_sub_epi16(p[7], u[1]);
1337  t[3] = _mm_add_epi16(p[7], u[1]);
1338
1339  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
1340  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
1341  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
1342  u[3] = _mm_unpackhi_epi16(t[1], t[2]);
1343
1344  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
1345  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
1346  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
1347  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
1348  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
1349  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
1350  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
1351  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
1352
1353  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1354  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1355  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1356  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1357  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1358  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1359  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1360  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1361
1362  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1363  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1364  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1365  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1366  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1367  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1368  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1369  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1370
1371  in[2] = _mm_packs_epi32(v[0], v[1]);
1372  in[6] = _mm_packs_epi32(v[4], v[5]);
1373  in[10] = _mm_packs_epi32(v[2], v[3]);
1374  in[14] = _mm_packs_epi32(v[6], v[7]);
1375
1376  // stage 2
1377  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
1378  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
1379  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
1380  u[3] = _mm_unpackhi_epi16(s[3], s[4]);
1381
1382  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
1383  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
1384  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1385  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1386  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1387  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1388  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1389  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1390
1391  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1392  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1393  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1394  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1395  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1396  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1397  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1398  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1399
1400  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1401  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1402  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1403  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1404  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1405  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1406  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1407  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1408
1409  t[2] = _mm_packs_epi32(v[0], v[1]);
1410  t[3] = _mm_packs_epi32(v[2], v[3]);
1411  t[4] = _mm_packs_epi32(v[4], v[5]);
1412  t[5] = _mm_packs_epi32(v[6], v[7]);
1413
1414  // stage 3
1415  p[0] = _mm_add_epi16(s[0], t[3]);
1416  p[1] = _mm_add_epi16(s[1], t[2]);
1417  p[2] = _mm_sub_epi16(s[1], t[2]);
1418  p[3] = _mm_sub_epi16(s[0], t[3]);
1419  p[4] = _mm_sub_epi16(s[7], t[4]);
1420  p[5] = _mm_sub_epi16(s[6], t[5]);
1421  p[6] = _mm_add_epi16(s[6], t[5]);
1422  p[7] = _mm_add_epi16(s[7], t[4]);
1423
1424  // stage 4
1425  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
1426  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
1427  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
1428  u[3] = _mm_unpackhi_epi16(p[2], p[5]);
1429
1430  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
1431  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
1432  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
1433  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
1434  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
1435  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
1436  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
1437  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
1438
1439  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1440  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1441  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1442  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1443  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1444  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1445  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1446  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1447
1448  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1449  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1450  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1451  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1452  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1453  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1454  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1455  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1456
1457  t[1] = _mm_packs_epi32(v[0], v[1]);
1458  t[2] = _mm_packs_epi32(v[2], v[3]);
1459  t[5] = _mm_packs_epi32(v[4], v[5]);
1460  t[6] = _mm_packs_epi32(v[6], v[7]);
1461
1462  // stage 5
1463  s[0] = _mm_add_epi16(p[0], t[1]);
1464  s[1] = _mm_sub_epi16(p[0], t[1]);
1465  s[2] = _mm_add_epi16(p[3], t[2]);
1466  s[3] = _mm_sub_epi16(p[3], t[2]);
1467  s[4] = _mm_sub_epi16(p[4], t[5]);
1468  s[5] = _mm_add_epi16(p[4], t[5]);
1469  s[6] = _mm_sub_epi16(p[7], t[6]);
1470  s[7] = _mm_add_epi16(p[7], t[6]);
1471
1472  // stage 6
1473  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
1474  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
1475  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
1476  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
1477  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
1478  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
1479  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
1480  u[7] = _mm_unpackhi_epi16(s[3], s[4]);
1481
1482  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
1483  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
1484  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
1485  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
1486  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
1487  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
1488  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
1489  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
1490  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
1491  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
1492  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
1493  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
1494  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
1495  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
1496  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
1497  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
1498
1499  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1500  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1501  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1502  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1503  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1504  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1505  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1506  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1507  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1508  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1509  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1510  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1511  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1512  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1513  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1514  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1515
1516  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1517  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1518  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1519  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1520  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1521  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1522  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1523  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1524  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1525  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1526  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1527  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1528  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1529  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1530  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1531  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1532
1533  in[1]  = _mm_packs_epi32(v[0], v[1]);
1534  in[9]  = _mm_packs_epi32(v[2], v[3]);
1535  in[5]  = _mm_packs_epi32(v[4], v[5]);
1536  in[13] = _mm_packs_epi32(v[6], v[7]);
1537  in[3]  = _mm_packs_epi32(v[8], v[9]);
1538  in[11] = _mm_packs_epi32(v[10], v[11]);
1539  in[7]  = _mm_packs_epi32(v[12], v[13]);
1540  in[15] = _mm_packs_epi32(v[14], v[15]);
1541}
1542
1543static void fadst16_8col(__m128i *in) {
1544  // perform 16x16 1-D ADST for 8 columns
1545  __m128i s[16], x[16], u[32], v[32];
1546  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1547  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1548  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1549  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1550  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1551  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1552  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1553  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1554  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1555  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1556  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1557  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1558  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1559  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1560  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1561  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1562  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1563  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1564  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1565  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1566  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1567  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1568  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1569  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1570  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1571  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
1572  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1573  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1574  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1575  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1576  const __m128i kZero = _mm_set1_epi16(0);
1577
1578  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1579  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1580  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1581  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1582  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1583  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1584  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1585  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1586  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1587  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1588  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1589  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1590  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1591  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1592  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1593  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1594
1595  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1596  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1597  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1598  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1599  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1600  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1601  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1602  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1603  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1604  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1605  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1606  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1607  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1608  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1609  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1610  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1611  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1612  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1613  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1614  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1615  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1616  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1617  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1618  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1619  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1620  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1621  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1622  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1623  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1624  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1625  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1626  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1627
1628  u[0] = _mm_add_epi32(v[0], v[16]);
1629  u[1] = _mm_add_epi32(v[1], v[17]);
1630  u[2] = _mm_add_epi32(v[2], v[18]);
1631  u[3] = _mm_add_epi32(v[3], v[19]);
1632  u[4] = _mm_add_epi32(v[4], v[20]);
1633  u[5] = _mm_add_epi32(v[5], v[21]);
1634  u[6] = _mm_add_epi32(v[6], v[22]);
1635  u[7] = _mm_add_epi32(v[7], v[23]);
1636  u[8] = _mm_add_epi32(v[8], v[24]);
1637  u[9] = _mm_add_epi32(v[9], v[25]);
1638  u[10] = _mm_add_epi32(v[10], v[26]);
1639  u[11] = _mm_add_epi32(v[11], v[27]);
1640  u[12] = _mm_add_epi32(v[12], v[28]);
1641  u[13] = _mm_add_epi32(v[13], v[29]);
1642  u[14] = _mm_add_epi32(v[14], v[30]);
1643  u[15] = _mm_add_epi32(v[15], v[31]);
1644  u[16] = _mm_sub_epi32(v[0], v[16]);
1645  u[17] = _mm_sub_epi32(v[1], v[17]);
1646  u[18] = _mm_sub_epi32(v[2], v[18]);
1647  u[19] = _mm_sub_epi32(v[3], v[19]);
1648  u[20] = _mm_sub_epi32(v[4], v[20]);
1649  u[21] = _mm_sub_epi32(v[5], v[21]);
1650  u[22] = _mm_sub_epi32(v[6], v[22]);
1651  u[23] = _mm_sub_epi32(v[7], v[23]);
1652  u[24] = _mm_sub_epi32(v[8], v[24]);
1653  u[25] = _mm_sub_epi32(v[9], v[25]);
1654  u[26] = _mm_sub_epi32(v[10], v[26]);
1655  u[27] = _mm_sub_epi32(v[11], v[27]);
1656  u[28] = _mm_sub_epi32(v[12], v[28]);
1657  u[29] = _mm_sub_epi32(v[13], v[29]);
1658  u[30] = _mm_sub_epi32(v[14], v[30]);
1659  u[31] = _mm_sub_epi32(v[15], v[31]);
1660
1661  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1662  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1663  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1664  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1665  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1666  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1667  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1668  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1669  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1670  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1671  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1672  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1673  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1674  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1675  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1676  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1677  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1678  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1679  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1680  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1681  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1682  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1683  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1684  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1685  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1686  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1687  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1688  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1689  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1690  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1691  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1692  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1693
1694  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1695  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1696  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1697  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1698  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1699  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1700  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1701  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1702  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1703  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1704  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1705  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1706  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1707  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1708  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1709  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1710  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1711  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1712  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1713  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1714  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1715  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1716  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1717  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1718  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1719  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1720  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1721  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1722  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1723  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1724  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1725  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1726
1727  s[0] = _mm_packs_epi32(u[0], u[1]);
1728  s[1] = _mm_packs_epi32(u[2], u[3]);
1729  s[2] = _mm_packs_epi32(u[4], u[5]);
1730  s[3] = _mm_packs_epi32(u[6], u[7]);
1731  s[4] = _mm_packs_epi32(u[8], u[9]);
1732  s[5] = _mm_packs_epi32(u[10], u[11]);
1733  s[6] = _mm_packs_epi32(u[12], u[13]);
1734  s[7] = _mm_packs_epi32(u[14], u[15]);
1735  s[8] = _mm_packs_epi32(u[16], u[17]);
1736  s[9] = _mm_packs_epi32(u[18], u[19]);
1737  s[10] = _mm_packs_epi32(u[20], u[21]);
1738  s[11] = _mm_packs_epi32(u[22], u[23]);
1739  s[12] = _mm_packs_epi32(u[24], u[25]);
1740  s[13] = _mm_packs_epi32(u[26], u[27]);
1741  s[14] = _mm_packs_epi32(u[28], u[29]);
1742  s[15] = _mm_packs_epi32(u[30], u[31]);
1743
1744  // stage 2
1745  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1746  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1747  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1748  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1749  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1750  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1751  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1752  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1753
1754  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1755  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1756  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1757  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1758  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1759  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1760  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1761  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1762  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1763  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1764  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1765  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1766  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1767  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1768  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1769  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1770
1771  u[0] = _mm_add_epi32(v[0], v[8]);
1772  u[1] = _mm_add_epi32(v[1], v[9]);
1773  u[2] = _mm_add_epi32(v[2], v[10]);
1774  u[3] = _mm_add_epi32(v[3], v[11]);
1775  u[4] = _mm_add_epi32(v[4], v[12]);
1776  u[5] = _mm_add_epi32(v[5], v[13]);
1777  u[6] = _mm_add_epi32(v[6], v[14]);
1778  u[7] = _mm_add_epi32(v[7], v[15]);
1779  u[8] = _mm_sub_epi32(v[0], v[8]);
1780  u[9] = _mm_sub_epi32(v[1], v[9]);
1781  u[10] = _mm_sub_epi32(v[2], v[10]);
1782  u[11] = _mm_sub_epi32(v[3], v[11]);
1783  u[12] = _mm_sub_epi32(v[4], v[12]);
1784  u[13] = _mm_sub_epi32(v[5], v[13]);
1785  u[14] = _mm_sub_epi32(v[6], v[14]);
1786  u[15] = _mm_sub_epi32(v[7], v[15]);
1787
1788  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1789  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1790  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1791  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1792  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1793  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1794  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1795  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1796  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1797  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1798  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1799  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1800  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1801  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1802  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1803  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1804
1805  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1806  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1807  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1808  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1809  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1810  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1811  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1812  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1813  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1814  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1815  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1816  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1817  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1818  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1819  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1820  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1821
1822  x[0] = _mm_add_epi16(s[0], s[4]);
1823  x[1] = _mm_add_epi16(s[1], s[5]);
1824  x[2] = _mm_add_epi16(s[2], s[6]);
1825  x[3] = _mm_add_epi16(s[3], s[7]);
1826  x[4] = _mm_sub_epi16(s[0], s[4]);
1827  x[5] = _mm_sub_epi16(s[1], s[5]);
1828  x[6] = _mm_sub_epi16(s[2], s[6]);
1829  x[7] = _mm_sub_epi16(s[3], s[7]);
1830  x[8] = _mm_packs_epi32(u[0], u[1]);
1831  x[9] = _mm_packs_epi32(u[2], u[3]);
1832  x[10] = _mm_packs_epi32(u[4], u[5]);
1833  x[11] = _mm_packs_epi32(u[6], u[7]);
1834  x[12] = _mm_packs_epi32(u[8], u[9]);
1835  x[13] = _mm_packs_epi32(u[10], u[11]);
1836  x[14] = _mm_packs_epi32(u[12], u[13]);
1837  x[15] = _mm_packs_epi32(u[14], u[15]);
1838
1839  // stage 3
1840  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1841  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1842  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1843  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1844  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1845  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1846  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1847  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1848
1849  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1850  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1851  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1852  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1853  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1854  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1855  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1856  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1857  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1858  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1859  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1860  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1861  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1862  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1863  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1864  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1865
1866  u[0] = _mm_add_epi32(v[0], v[4]);
1867  u[1] = _mm_add_epi32(v[1], v[5]);
1868  u[2] = _mm_add_epi32(v[2], v[6]);
1869  u[3] = _mm_add_epi32(v[3], v[7]);
1870  u[4] = _mm_sub_epi32(v[0], v[4]);
1871  u[5] = _mm_sub_epi32(v[1], v[5]);
1872  u[6] = _mm_sub_epi32(v[2], v[6]);
1873  u[7] = _mm_sub_epi32(v[3], v[7]);
1874  u[8] = _mm_add_epi32(v[8], v[12]);
1875  u[9] = _mm_add_epi32(v[9], v[13]);
1876  u[10] = _mm_add_epi32(v[10], v[14]);
1877  u[11] = _mm_add_epi32(v[11], v[15]);
1878  u[12] = _mm_sub_epi32(v[8], v[12]);
1879  u[13] = _mm_sub_epi32(v[9], v[13]);
1880  u[14] = _mm_sub_epi32(v[10], v[14]);
1881  u[15] = _mm_sub_epi32(v[11], v[15]);
1882
1883  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1884  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1885  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1886  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1887  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1888  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1889  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1890  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1891  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1892  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1893  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1894  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1895  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1896  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1897  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1898  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1899
1900  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1901  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1902  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1903  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1904  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1905  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1906  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1907  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1908  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1909  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1910  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1911  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1912  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1913  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1914  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1915  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1916
1917  s[0] = _mm_add_epi16(x[0], x[2]);
1918  s[1] = _mm_add_epi16(x[1], x[3]);
1919  s[2] = _mm_sub_epi16(x[0], x[2]);
1920  s[3] = _mm_sub_epi16(x[1], x[3]);
1921  s[4] = _mm_packs_epi32(v[0], v[1]);
1922  s[5] = _mm_packs_epi32(v[2], v[3]);
1923  s[6] = _mm_packs_epi32(v[4], v[5]);
1924  s[7] = _mm_packs_epi32(v[6], v[7]);
1925  s[8] = _mm_add_epi16(x[8], x[10]);
1926  s[9] = _mm_add_epi16(x[9], x[11]);
1927  s[10] = _mm_sub_epi16(x[8], x[10]);
1928  s[11] = _mm_sub_epi16(x[9], x[11]);
1929  s[12] = _mm_packs_epi32(v[8], v[9]);
1930  s[13] = _mm_packs_epi32(v[10], v[11]);
1931  s[14] = _mm_packs_epi32(v[12], v[13]);
1932  s[15] = _mm_packs_epi32(v[14], v[15]);
1933
1934  // stage 4
1935  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1936  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1937  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1938  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1939  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1940  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1941  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1942  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1943
1944  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1945  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1946  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1947  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1948  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1949  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1950  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1951  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1952  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1953  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1954  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1955  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1956  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1957  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1958  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1959  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1960
1961  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1962  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1963  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1964  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1965  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1966  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1967  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1968  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1969  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1970  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1971  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1972  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1973  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1974  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1975  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1976  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1977
1978  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1979  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1980  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1981  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1982  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1983  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1984  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1985  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1986  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1987  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1988  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1989  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1990  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1991  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1992  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1993  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1994
1995  in[0] = s[0];
1996  in[1] = _mm_sub_epi16(kZero, s[8]);
1997  in[2] = s[12];
1998  in[3] = _mm_sub_epi16(kZero, s[4]);
1999  in[4] = _mm_packs_epi32(v[4], v[5]);
2000  in[5] = _mm_packs_epi32(v[12], v[13]);
2001  in[6] = _mm_packs_epi32(v[8], v[9]);
2002  in[7] = _mm_packs_epi32(v[0], v[1]);
2003  in[8] = _mm_packs_epi32(v[2], v[3]);
2004  in[9] = _mm_packs_epi32(v[10], v[11]);
2005  in[10] = _mm_packs_epi32(v[14], v[15]);
2006  in[11] = _mm_packs_epi32(v[6], v[7]);
2007  in[12] = s[5];
2008  in[13] = _mm_sub_epi16(kZero, s[13]);
2009  in[14] = s[9];
2010  in[15] = _mm_sub_epi16(kZero, s[1]);
2011}
2012
// Runs fdct16_8col on in0 and then on in1, and finally hands both buffers to
// array_transpose_16x16.  Both buffers are modified in place.
// NOTE(review): presumably in0/in1 are the two 8-column halves of a 16x16
// block, so this is one 1-D DCT pass followed by a transpose -- confirm
// against load_buffer_16x16.
static void fdct16_sse2(__m128i *in0, __m128i *in1) {
  fdct16_8col(in0);
  fdct16_8col(in1);
  array_transpose_16x16(in0, in1);
}
2018
// Runs fadst16_8col (a 1-D 16-point ADST over 8 columns) on in0 and then on
// in1, and finally hands both buffers to array_transpose_16x16.  Both buffers
// are modified in place.
// NOTE(review): presumably in0/in1 are the two 8-column halves of a 16x16
// block -- confirm against load_buffer_16x16.
static void fadst16_sse2(__m128i *in0, __m128i *in1) {
  fadst16_8col(in0);
  fadst16_8col(in1);
  array_transpose_16x16(in0, in1);
}
2024
2025void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output,
2026                       int stride, int tx_type) {
2027  __m128i in0[16], in1[16];
2028
2029  switch (tx_type) {
2030    case DCT_DCT:
2031      vpx_fdct16x16_sse2(input, output, stride);
2032      break;
2033    case ADST_DCT:
2034      load_buffer_16x16(input, in0, in1, stride);
2035      fadst16_sse2(in0, in1);
2036      right_shift_16x16(in0, in1);
2037      fdct16_sse2(in0, in1);
2038      write_buffer_16x16(output, in0, in1, 16);
2039      break;
2040    case DCT_ADST:
2041      load_buffer_16x16(input, in0, in1, stride);
2042      fdct16_sse2(in0, in1);
2043      right_shift_16x16(in0, in1);
2044      fadst16_sse2(in0, in1);
2045      write_buffer_16x16(output, in0, in1, 16);
2046      break;
2047    case ADST_ADST:
2048      load_buffer_16x16(input, in0, in1, stride);
2049      fadst16_sse2(in0, in1);
2050      right_shift_16x16(in0, in1);
2051      fadst16_sse2(in0, in1);
2052      write_buffer_16x16(output, in0, in1, 16);
2053      break;
2054    default:
2055      assert(0);
2056      break;
2057  }
2058}
2059