/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
#include <assert.h>
#include <emmintrin.h>  // SSE2
#include <string.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Reconstructs one row of four pixels and steps to the next row.
//
// Reads 4 bytes at dest, widens them to 16 bits, adds the low four 16-bit
// lanes of in_x, saturates the sums to [0, 255], writes the 4 bytes back and
// finally advances dest by stride.
//
// The expansion uses two names from the enclosing scope: `zero` (an all-zero
// __m128i) and `stride`. dest must be a modifiable byte-pointer lvalue; it is
// evaluated several times and is updated by the macro. The 4-byte accesses go
// through memcpy so they are valid for any alignment of dest.
#define RECON_AND_STORE4X4(dest, in_x) \
{ \
  int px4; \
  __m128i d0; \
  memcpy(&px4, (dest), sizeof(px4)); \
  d0 = _mm_cvtsi32_si128(px4); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16((in_x), d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  px4 = _mm_cvtsi128_si32(d0); \
  memcpy((dest), &px4, sizeof(px4)); \
  (dest) += stride; \
}
279b35249446b07f40ac5fcc3205f2c048616efacchkuang
285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
29ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
30ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i eight = _mm_set1_epi16(8);
31ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
32ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
33ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
34ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i input0, input1, input2, input3;
37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Rows
399b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_load_si128((const __m128i *)input);
409b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_load_si128((const __m128i *)(input + 8));
41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
42ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Construct i3, i1, i3, i1, i2, i0, i2, i0
43ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_shufflelo_epi16(input0, 0xd8);
449b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_shufflehi_epi16(input0, 0xd8);
45ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_shufflelo_epi16(input2, 0xd8);
469b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_shufflehi_epi16(input2, 0xd8);
47ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
489b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input0, input0);
49ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_unpacklo_epi32(input0, input0);
509b35249446b07f40ac5fcc3205f2c048616efacchkuang  input3 = _mm_unpackhi_epi32(input2, input2);
51ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_unpacklo_epi32(input2, input2);
52ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
53ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
54ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_madd_epi16(input0, cst);
55ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_madd_epi16(input1, cst);
56ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_madd_epi16(input2, cst);
57ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_madd_epi16(input3, cst);
58ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
59ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_add_epi32(input0, rounding);
60ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_add_epi32(input1, rounding);
61ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi32(input2, rounding);
62ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi32(input3, rounding);
63ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
64ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
65ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
66ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
67ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
68ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
709b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_packs_epi32(input0, input1);
719b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_packs_epi32(input2, input3);
72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
73ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Transpose
749b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_unpacklo_epi16(input0, input1);
759b35249446b07f40ac5fcc3205f2c048616efacchkuang  input3 = _mm_unpackhi_epi16(input0, input1);
769b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_unpacklo_epi32(input2, input3);
779b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input2, input3);
78ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
79ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Switch column2, column 3, and then, we got:
80ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // input2: column1, column 0;  input3: column2, column 3.
81ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shuffle_epi32(input1, 0x4e);
82ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input0, input1);
83ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_sub_epi16(input0, input1);
84ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
85ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Columns
86ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Construct i3, i1, i3, i1, i2, i0, i2, i0
879b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_unpacklo_epi32(input2, input2);
889b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input2, input2);
899b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_unpackhi_epi32(input3, input3);
90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_unpacklo_epi32(input3, input3);
91ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
92ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
93ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_madd_epi16(input0, cst);
94ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_madd_epi16(input1, cst);
95ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_madd_epi16(input2, cst);
96ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_madd_epi16(input3, cst);
97ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
98ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_add_epi32(input0, rounding);
99ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_add_epi32(input1, rounding);
100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi32(input2, rounding);
101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi32(input3, rounding);
102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
1099b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_packs_epi32(input0, input2);
1109b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_packs_epi32(input1, input3);
111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Transpose
1139b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_unpacklo_epi16(input0, input1);
1149b35249446b07f40ac5fcc3205f2c048616efacchkuang  input3 = _mm_unpackhi_epi16(input0, input1);
1159b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_unpacklo_epi32(input2, input3);
1169b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input2, input3);
117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Switch column2, column 3, and then, we got:
119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // input2: column1, column 0;  input3: column2, column 3.
120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shuffle_epi32(input1, 0x4e);
121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input0, input1);
122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_sub_epi16(input0, input1);
123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final round and shift
125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input2, eight);
126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi16(input3, eight);
127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi16(input2, 4);
129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi16(input3, 4);
130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1319b35249446b07f40ac5fcc3205f2c048616efacchkuang  // Reconstruction and Store
1329b35249446b07f40ac5fcc3205f2c048616efacchkuang  {
1339b35249446b07f40ac5fcc3205f2c048616efacchkuang     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
1349b35249446b07f40ac5fcc3205f2c048616efacchkuang     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
1359b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_unpacklo_epi32(d0,
1369b35249446b07f40ac5fcc3205f2c048616efacchkuang          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
1379b35249446b07f40ac5fcc3205f2c048616efacchkuang     d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
1389b35249446b07f40ac5fcc3205f2c048616efacchkuang                    *(const int *) (dest + stride * 3)), d2);
1399b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_unpacklo_epi8(d0, zero);
1409b35249446b07f40ac5fcc3205f2c048616efacchkuang     d2 = _mm_unpacklo_epi8(d2, zero);
1419b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_add_epi16(d0, input2);
1429b35249446b07f40ac5fcc3205f2c048616efacchkuang     d2 = _mm_add_epi16(d2, input3);
1439b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_packus_epi16(d0, d2);
1449b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input0
1459b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)dest = _mm_cvtsi128_si32(d0);
1469b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input1
1479b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_srli_si128(d0, 4);
1489b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
1499b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input2
1509b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_srli_si128(d0, 4);
1519b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
1529b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input3
1539b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_srli_si128(d0, 4);
1549b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
15991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i dc_value;
16091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i zero = _mm_setzero_si128();
16191037db265ecdd914a26e056cf69207b4f50924ehkuang  int a;
16291037db265ecdd914a26e056cf69207b4f50924ehkuang
16391037db265ecdd914a26e056cf69207b4f50924ehkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
16491037db265ecdd914a26e056cf69207b4f50924ehkuang  a = dct_const_round_shift(a * cospi_16_64);
16591037db265ecdd914a26e056cf69207b4f50924ehkuang  a = ROUND_POWER_OF_TWO(a, 4);
16691037db265ecdd914a26e056cf69207b4f50924ehkuang
16791037db265ecdd914a26e056cf69207b4f50924ehkuang  dc_value = _mm_set1_epi16(a);
16891037db265ecdd914a26e056cf69207b4f50924ehkuang
16991037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17091037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17191037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17391037db265ecdd914a26e056cf69207b4f50924ehkuang}
17491037db265ecdd914a26e056cf69207b4f50924ehkuang
17591037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void transpose_4x4(__m128i *res) {
17691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
177b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
17891037db265ecdd914a26e056cf69207b4f50924ehkuang
179b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
180b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
18191037db265ecdd914a26e056cf69207b4f50924ehkuang}
18291037db265ecdd914a26e056cf69207b4f50924ehkuang
183b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct4_sse2(__m128i *in) {
18491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
18591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
18691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
18791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
18891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
18991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u[8], v[8];
19091037db265ecdd914a26e056cf69207b4f50924ehkuang
19191037db265ecdd914a26e056cf69207b4f50924ehkuang  transpose_4x4(in);
19291037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
193b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
194b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
19591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
19691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
19791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
19891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
19991037db265ecdd914a26e056cf69207b4f50924ehkuang
20091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
20191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
20291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
20391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
20491037db265ecdd914a26e056cf69207b4f50924ehkuang
20591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
20691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
20791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
20891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
20991037db265ecdd914a26e056cf69207b4f50924ehkuang
210b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0] = _mm_packs_epi32(v[0], v[1]);
211b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1] = _mm_packs_epi32(v[3], v[2]);
21291037db265ecdd914a26e056cf69207b4f50924ehkuang
21391037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
214b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[0] = _mm_add_epi16(u[0], u[1]);
215b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[1] = _mm_sub_epi16(u[0], u[1]);
216b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
21791037db265ecdd914a26e056cf69207b4f50924ehkuang}
21891037db265ecdd914a26e056cf69207b4f50924ehkuang
219b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst4_sse2(__m128i *in) {
22091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
22191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
22291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
22391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
22491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
22591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i kZero = _mm_set1_epi16(0);
22691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
22791037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u[8], v[8], in7;
22891037db265ecdd914a26e056cf69207b4f50924ehkuang
22991037db265ecdd914a26e056cf69207b4f50924ehkuang  transpose_4x4(in);
230b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in7 = _mm_srli_si128(in[1], 8);
231b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in7 = _mm_add_epi16(in7, in[0]);
232b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in7 = _mm_sub_epi16(in7, in[1]);
23391037db265ecdd914a26e056cf69207b4f50924ehkuang
234b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
235b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
23691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(in7, kZero);
237b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[3] = _mm_unpackhi_epi16(in[0], kZero);
23891037db265ecdd914a26e056cf69207b4f50924ehkuang
23991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
24091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
24191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
24291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
24391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
24491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
24591037db265ecdd914a26e056cf69207b4f50924ehkuang
24691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[1]);
24791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[3], v[4]);
24891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = v[2];
24991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(u[0], u[1]);
25091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_slli_epi32(v[5], 2);
25191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(u[3], v[5]);
25291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_sub_epi32(u[5], u[4]);
25391037db265ecdd914a26e056cf69207b4f50924ehkuang
25491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
25591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
25691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
25791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
25891037db265ecdd914a26e056cf69207b4f50924ehkuang
25991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
26091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
26191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
26291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
26391037db265ecdd914a26e056cf69207b4f50924ehkuang
264b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[0] = _mm_packs_epi32(u[0], u[1]);
265b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[1] = _mm_packs_epi32(u[2], u[3]);
26691037db265ecdd914a26e056cf69207b4f50924ehkuang}
26791037db265ecdd914a26e056cf69207b4f50924ehkuang
2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            int tx_type) {
270b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in[2];
27191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i zero = _mm_setzero_si128();
27291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i eight = _mm_set1_epi16(8);
27391037db265ecdd914a26e056cf69207b4f50924ehkuang
274b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[0]= _mm_loadu_si128((const __m128i *)(input));
275b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
27691037db265ecdd914a26e056cf69207b4f50924ehkuang
27791037db265ecdd914a26e056cf69207b4f50924ehkuang  switch (tx_type) {
27891037db265ecdd914a26e056cf69207b4f50924ehkuang    case 0:  // DCT_DCT
279b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      idct4_sse2(in);
280b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      idct4_sse2(in);
28191037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
28291037db265ecdd914a26e056cf69207b4f50924ehkuang    case 1:  // ADST_DCT
283b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      idct4_sse2(in);
284b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      iadst4_sse2(in);
28591037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
28691037db265ecdd914a26e056cf69207b4f50924ehkuang    case 2:  // DCT_ADST
287b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      iadst4_sse2(in);
288b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      idct4_sse2(in);
28991037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
29091037db265ecdd914a26e056cf69207b4f50924ehkuang    case 3:  // ADST_ADST
291b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      iadst4_sse2(in);
292b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      iadst4_sse2(in);
29391037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
29491037db265ecdd914a26e056cf69207b4f50924ehkuang    default:
29591037db265ecdd914a26e056cf69207b4f50924ehkuang      assert(0);
29691037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
29791037db265ecdd914a26e056cf69207b4f50924ehkuang  }
29891037db265ecdd914a26e056cf69207b4f50924ehkuang
29991037db265ecdd914a26e056cf69207b4f50924ehkuang  // Final round and shift
30091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_add_epi16(in[0], eight);
30191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_add_epi16(in[1], eight);
30291037db265ecdd914a26e056cf69207b4f50924ehkuang
30391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_srai_epi16(in[0], 4);
30491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_srai_epi16(in[1], 4);
30591037db265ecdd914a26e056cf69207b4f50924ehkuang
306b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Reconstruction and Store
307b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  {
308b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
309b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
310b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_unpacklo_epi32(d0,
311b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
312b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
313b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                    *(const int *) (dest + stride * 3)));
314b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_unpacklo_epi8(d0, zero);
315b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d2 = _mm_unpacklo_epi8(d2, zero);
316b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_add_epi16(d0, in[0]);
317b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d2 = _mm_add_epi16(d2, in[1]);
318b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_packus_epi16(d0, d2);
319b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     // store result[0]
320b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     *(int *)dest = _mm_cvtsi128_si32(d0);
321b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     // store result[1]
322b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_srli_si128(d0, 4);
323b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
324b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     // store result[2]
325b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_srli_si128(d0, 4);
326b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
327b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     // store result[3]
328b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_srli_si128(d0, 4);
329b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
330b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  }
33191037db265ecdd914a26e056cf69207b4f50924ehkuang}
33291037db265ecdd914a26e056cf69207b4f50924ehkuang
// Transposes an 8x8 block of 16-bit values.
//
// in0..in7 are the eight rows (one __m128i each); out0..out7 receive the
// eight columns, so lane r of outC equals lane C of inR. Every input is read
// before any output is assigned, so an output may name the same variable as
// an input. Arguments are evaluated more than once and must be free of side
// effects; out0..out7 must be assignable.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }
363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Shuffles four vectors of eight 16-bit lanes (tmp0..tmp3) into out0..out3.
// Writing a/b/c/d for the lanes of tmp0/tmp1/tmp2/tmp3 (lane 0 first), the
// result for K = 0..3 is:
//   outK = { a[4+K], b[4+K], b[K], a[K], c[K], d[K], d[4+K], c[4+K] }
// NOTE(review): this is not a plain transpose; the hi/lo mix and the swapped
// operand order presumably match the data layout produced by its caller
// (the "_10" reduced-coefficient path) -- confirm against the call site.
// Every input is read before any output is written, so outputs may alias
// inputs.
#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
                         out0, out1, out2, out3) \
  {                                                       \
    /* 16-bit interleave of row pairs. */                 \
    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
                                                          \
    /* 32-bit interleave of the pairs built above. */     \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
                                                            \
    /* 64-bit interleave yields the four output rows. */    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  }
382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Transposes the low four 16-bit lanes of in0..in3 and stores the result back
// into in0 and in1. Writing a/b/c/d for the lanes of in0/in1/in2/in3:
//   in0 = { a0, b0, c0, d0, a1, b1, c1, d1 }
//   in1 = { a2, b2, c2, d2, a3, b3, c3, d3 }
// NOTE(review): out0 and out1 are accepted but never referenced; the results
// always land in in0/in1. Callers should pass in0/in1 as out0/out1 until this
// is reconciled with TRANSPOSE_8X8_10, which honours its output arguments.
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
                                                        \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
  }
391b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
// Transposes the low four 16-bit lanes of in0..in3 into out0 and out1.
// Writing a/b/c/d for the lanes of in0/in1/in2/in3:
//   out0 = { a0, b0, c0, d0, a1, b1, c1, d1 }
//   out1 = { a2, b2, c2, d2, a3, b3, c3, d3 }
// The upper four lanes of every input are ignored. Inputs are fully consumed
// before the outputs are assigned, so out0/out1 may alias the inputs.
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);            \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);            \
  }
399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Multiplies interleaved 16-bit pairs by constant pairs, sums each pair, then
// rounds, shifts and packs back to 16 bits:
//   res0 = pack(round_shift(madd(lo_0, cst0)), round_shift(madd(hi_0, cst0)))
//   res1 = same with cst1 applied to lo_0/hi_0
//   res2 = same with cst2 applied to lo_1/hi_1
//   res3 = same with cst3 applied to lo_1/hi_1
// where round_shift(x) = (x + rounding) >> DCT_CONST_BITS (arithmetic shift)
// and the final pack saturates to signed 16 bits.
// Relies on the enclosing scope for: tmp0..tmp7 (__m128i scratch variables),
// `rounding` (__m128i of 32-bit lanes) and DCT_CONST_BITS.
// The expansion is a bare brace block, so it may be used without a trailing
// semicolon.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  {   \
      /* Pairwise multiply-accumulate into 32-bit lanes. */ \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      tmp4 = _mm_madd_epi16(lo_1, cst2); \
      tmp5 = _mm_madd_epi16(hi_1, cst2); \
      tmp6 = _mm_madd_epi16(lo_1, cst3); \
      tmp7 = _mm_madd_epi16(hi_1, cst3); \
      \
      /* Add the rounding bias before the shift. */ \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      tmp4 = _mm_add_epi32(tmp4, rounding); \
      tmp5 = _mm_add_epi32(tmp5, rounding); \
      tmp6 = _mm_add_epi32(tmp6, rounding); \
      tmp7 = _mm_add_epi32(tmp7, rounding); \
      \
      /* Drop the fixed-point fraction bits. */ \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
      \
      /* Narrow back to 16 bits: lo half in lanes 0-3, hi half in 4-7. */ \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
      res2 = _mm_packs_epi32(tmp4, tmp5); \
      res3 = _mm_packs_epi32(tmp6, tmp7); \
  }
436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Two-output variant of MULTIPLICATION_AND_ADD:
//   res0 = pack(round_shift(madd(lo_0, cst0)), round_shift(madd(hi_0, cst0)))
//   res1 = pack(round_shift(madd(lo_0, cst1)), round_shift(madd(hi_0, cst1)))
// where round_shift(x) = (x + rounding) >> DCT_CONST_BITS (arithmetic shift)
// and the pack saturates to signed 16 bits.
// Relies on the enclosing scope for: tmp0..tmp3 (__m128i scratch variables),
// `rounding` (__m128i of 32-bit lanes) and DCT_CONST_BITS.
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  {   \
      /* Pairwise multiply-accumulate into 32-bit lanes. */ \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      \
      /* Add the rounding bias before the shift. */ \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      /* Drop the fixed-point fraction bits. */ \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      /* Narrow back to 16 bits: lo half in lanes 0-3, hi half in 4-7. */ \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
  }
457b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
// Four-stage 1-D butterfly over eight vectors of 16-bit lanes (in0..in7),
// writing the results to out0..out7.
// Relies on the enclosing scope for: stg1_0..stg1_3 and stg2_0..stg2_3
// (constant pairs), stp1_0..stp1_7 and stp2_0..stp2_7 (intermediate values),
// tmp0..tmp7, `rounding`, and DCT_CONST_BITS.
// The inputs are only read in stages 1 and 2 and the outputs are only written
// in stage 4, so out0..out7 may name the same variables as in0..in7.
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3, out4, out5, out6, out7)  \
  { \
  /* Stage1: rotate the odd inputs (1,7) and (3,5) into stp1_4..stp1_7. */ \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                          stg1_1, stg1_2, stg1_3, stp1_4,      \
                          stp1_7, stp1_5, stp1_6)              \
  } \
    \
  /* Stage2: rotate the even inputs (0,4) and (2,6); butterfly odd half. */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0,     \
                           stp2_1, stp2_2, stp2_3)             \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
    \
  /* Stage3: butterfly the even half; rotate (stp2_6, stp2_5) by stg2_1/0. */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4: final saturating butterfly between even and odd halves. */ \
  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }
529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Adds eight signed 16-bit residuals (in_x) to the eight bytes at `dest`,
// saturates each sum to 0..255, stores the bytes back, and then advances
// `dest` by `stride`.
// Relies on the enclosing scope for: `zero` (an all-zero __m128i) and
// `stride`. `dest` must be a modifiable pointer lvalue because the macro
// increments it.
#define RECON_AND_STORE(dest, in_x) \
  {                                                     \
     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
      /* Widen the 8 pixels to 16 bits so the add cannot wrap. */ \
      d0 = _mm_unpacklo_epi8(d0, zero); \
      d0 = _mm_add_epi16(in_x, d0); \
      /* Narrow with unsigned saturation; only the low 8 bytes are stored. */ \
      d0 = _mm_packus_epi16(d0, d0); \
      _mm_storel_epi64((__m128i *)(dest), d0); \
      dest += stride; \
  }
539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
5405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<4);
544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Load input data.
5605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in0 = _mm_load_si128((const __m128i *)input);
5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 2-D
570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 2; i++) {
5715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
572b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
573b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                  in0, in1, in2, in3, in4, in5, in6, in7);
574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // 4-stage 1D idct8x8
576b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
577b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian             in0, in1, in2, in3, in4, in5, in6, in7);
578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final rounding and shift
581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(in0, final_rounding);
582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(in1, final_rounding);
583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(in2, final_rounding);
584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(in3, final_rounding);
585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_adds_epi16(in4, final_rounding);
586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_adds_epi16(in5, final_rounding);
587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_adds_epi16(in6, final_rounding);
588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_adds_epi16(in7, final_rounding);
589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_srai_epi16(in0, 5);
591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_srai_epi16(in1, 5);
592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_srai_epi16(in2, 5);
593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_srai_epi16(in3, 5);
594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_srai_epi16(in4, 5);
595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_srai_epi16(in5, 5);
596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_srai_epi16(in6, 5);
597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_srai_epi16(in7, 5);
598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in0);
600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in1);
601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in2);
602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in3);
603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in4);
604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in5);
605ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in6);
606ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in7);
607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
608ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
610f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  __m128i dc_value;
611f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  const __m128i zero = _mm_setzero_si128();
612f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int a;
613f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
614f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
615f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(a * cospi_16_64);
616f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = ROUND_POWER_OF_TWO(a, 5);
617f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
618f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  dc_value = _mm_set1_epi16(a);
619f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
620f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
621f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
622f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
623f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
624f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
625f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
626f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
627f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
628f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang}
629f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
63091037db265ecdd914a26e056cf69207b4f50924ehkuang// perform 8x8 transpose
63191037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
63291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
63391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
63491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
63591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
63691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
63791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
63891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
63991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
64091037db265ecdd914a26e056cf69207b4f50924ehkuang
64191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
64291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
64391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
64491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
64591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
64691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
64791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
64891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
64991037db265ecdd914a26e056cf69207b4f50924ehkuang
65091037db265ecdd914a26e056cf69207b4f50924ehkuang  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
65191037db265ecdd914a26e056cf69207b4f50924ehkuang  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
65291037db265ecdd914a26e056cf69207b4f50924ehkuang  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
65391037db265ecdd914a26e056cf69207b4f50924ehkuang  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
65491037db265ecdd914a26e056cf69207b4f50924ehkuang  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
65591037db265ecdd914a26e056cf69207b4f50924ehkuang  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
65691037db265ecdd914a26e056cf69207b4f50924ehkuang  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
65791037db265ecdd914a26e056cf69207b4f50924ehkuang  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
65891037db265ecdd914a26e056cf69207b4f50924ehkuang}
65991037db265ecdd914a26e056cf69207b4f50924ehkuang
660b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
661b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
662b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
663b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
664b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
665b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
666b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
667b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
668b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
669b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
670b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
671b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
672b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
673b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
674b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
675b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}
676b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
677b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct8_sse2(__m128i *in) {
67891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
67991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
68091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
68191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
68291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
68391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
68491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
68591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
68691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
68791037db265ecdd914a26e056cf69207b4f50924ehkuang
68891037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
68991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
69091037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
69191037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
69291037db265ecdd914a26e056cf69207b4f50924ehkuang
6935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
694b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
695b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                in0, in1, in2, in3, in4, in5, in6, in7);
69691037db265ecdd914a26e056cf69207b4f50924ehkuang
69791037db265ecdd914a26e056cf69207b4f50924ehkuang  // 4-stage 1D idct8x8
698b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
699b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian           in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
70091037db265ecdd914a26e056cf69207b4f50924ehkuang}
70191037db265ecdd914a26e056cf69207b4f50924ehkuang
702b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst8_sse2(__m128i *in) {
70391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
70491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
70591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
70691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
70791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
70891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
70991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
71091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
71191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
71291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
71391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
71491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
71591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
71691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__const_0 = _mm_set1_epi16(0);
71791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
71891037db265ecdd914a26e056cf69207b4f50924ehkuang
71991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
72091037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
72191037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
72291037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
72391037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
72491037db265ecdd914a26e056cf69207b4f50924ehkuang
72591037db265ecdd914a26e056cf69207b4f50924ehkuang  // transpose
72691037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(in, in);
72791037db265ecdd914a26e056cf69207b4f50924ehkuang
72891037db265ecdd914a26e056cf69207b4f50924ehkuang  // properly aligned for butterfly input
72991037db265ecdd914a26e056cf69207b4f50924ehkuang  in0  = in[7];
73091037db265ecdd914a26e056cf69207b4f50924ehkuang  in1  = in[0];
73191037db265ecdd914a26e056cf69207b4f50924ehkuang  in2  = in[5];
73291037db265ecdd914a26e056cf69207b4f50924ehkuang  in3  = in[2];
73391037db265ecdd914a26e056cf69207b4f50924ehkuang  in4  = in[3];
73491037db265ecdd914a26e056cf69207b4f50924ehkuang  in5  = in[4];
73591037db265ecdd914a26e056cf69207b4f50924ehkuang  in6  = in[1];
73691037db265ecdd914a26e056cf69207b4f50924ehkuang  in7  = in[6];
73791037db265ecdd914a26e056cf69207b4f50924ehkuang
73891037db265ecdd914a26e056cf69207b4f50924ehkuang  // column transformation
73991037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
74091037db265ecdd914a26e056cf69207b4f50924ehkuang  // interleave and multiply/add into 32-bit integer
74191037db265ecdd914a26e056cf69207b4f50924ehkuang  s0 = _mm_unpacklo_epi16(in0, in1);
74291037db265ecdd914a26e056cf69207b4f50924ehkuang  s1 = _mm_unpackhi_epi16(in0, in1);
74391037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_unpacklo_epi16(in2, in3);
74491037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_unpackhi_epi16(in2, in3);
74591037db265ecdd914a26e056cf69207b4f50924ehkuang  s4 = _mm_unpacklo_epi16(in4, in5);
74691037db265ecdd914a26e056cf69207b4f50924ehkuang  s5 = _mm_unpackhi_epi16(in4, in5);
74791037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_unpacklo_epi16(in6, in7);
74891037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_unpackhi_epi16(in6, in7);
74991037db265ecdd914a26e056cf69207b4f50924ehkuang
75091037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
75191037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
75291037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
75391037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
75491037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
75591037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
75691037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
75791037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
75891037db265ecdd914a26e056cf69207b4f50924ehkuang  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
75991037db265ecdd914a26e056cf69207b4f50924ehkuang  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
76091037db265ecdd914a26e056cf69207b4f50924ehkuang  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
76191037db265ecdd914a26e056cf69207b4f50924ehkuang  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
76291037db265ecdd914a26e056cf69207b4f50924ehkuang  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
76391037db265ecdd914a26e056cf69207b4f50924ehkuang  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
76491037db265ecdd914a26e056cf69207b4f50924ehkuang  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
76591037db265ecdd914a26e056cf69207b4f50924ehkuang  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
76691037db265ecdd914a26e056cf69207b4f50924ehkuang
76791037db265ecdd914a26e056cf69207b4f50924ehkuang  // addition
76891037db265ecdd914a26e056cf69207b4f50924ehkuang  w0 = _mm_add_epi32(u0, u8);
76991037db265ecdd914a26e056cf69207b4f50924ehkuang  w1 = _mm_add_epi32(u1, u9);
77091037db265ecdd914a26e056cf69207b4f50924ehkuang  w2 = _mm_add_epi32(u2, u10);
77191037db265ecdd914a26e056cf69207b4f50924ehkuang  w3 = _mm_add_epi32(u3, u11);
77291037db265ecdd914a26e056cf69207b4f50924ehkuang  w4 = _mm_add_epi32(u4, u12);
77391037db265ecdd914a26e056cf69207b4f50924ehkuang  w5 = _mm_add_epi32(u5, u13);
77491037db265ecdd914a26e056cf69207b4f50924ehkuang  w6 = _mm_add_epi32(u6, u14);
77591037db265ecdd914a26e056cf69207b4f50924ehkuang  w7 = _mm_add_epi32(u7, u15);
77691037db265ecdd914a26e056cf69207b4f50924ehkuang  w8 = _mm_sub_epi32(u0, u8);
77791037db265ecdd914a26e056cf69207b4f50924ehkuang  w9 = _mm_sub_epi32(u1, u9);
77891037db265ecdd914a26e056cf69207b4f50924ehkuang  w10 = _mm_sub_epi32(u2, u10);
77991037db265ecdd914a26e056cf69207b4f50924ehkuang  w11 = _mm_sub_epi32(u3, u11);
78091037db265ecdd914a26e056cf69207b4f50924ehkuang  w12 = _mm_sub_epi32(u4, u12);
78191037db265ecdd914a26e056cf69207b4f50924ehkuang  w13 = _mm_sub_epi32(u5, u13);
78291037db265ecdd914a26e056cf69207b4f50924ehkuang  w14 = _mm_sub_epi32(u6, u14);
78391037db265ecdd914a26e056cf69207b4f50924ehkuang  w15 = _mm_sub_epi32(u7, u15);
78491037db265ecdd914a26e056cf69207b4f50924ehkuang
78591037db265ecdd914a26e056cf69207b4f50924ehkuang  // shift and rounding
78691037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
78791037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
78891037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
78991037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
79091037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
79191037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
79291037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
79391037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
79491037db265ecdd914a26e056cf69207b4f50924ehkuang  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
79591037db265ecdd914a26e056cf69207b4f50924ehkuang  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
79691037db265ecdd914a26e056cf69207b4f50924ehkuang  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
79791037db265ecdd914a26e056cf69207b4f50924ehkuang  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
79891037db265ecdd914a26e056cf69207b4f50924ehkuang  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
79991037db265ecdd914a26e056cf69207b4f50924ehkuang  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
80091037db265ecdd914a26e056cf69207b4f50924ehkuang  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
80191037db265ecdd914a26e056cf69207b4f50924ehkuang  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
80291037db265ecdd914a26e056cf69207b4f50924ehkuang
80391037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
80491037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
80591037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
80691037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
80791037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
80891037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
80991037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
81091037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
81191037db265ecdd914a26e056cf69207b4f50924ehkuang  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
81291037db265ecdd914a26e056cf69207b4f50924ehkuang  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
81391037db265ecdd914a26e056cf69207b4f50924ehkuang  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
81491037db265ecdd914a26e056cf69207b4f50924ehkuang  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
81591037db265ecdd914a26e056cf69207b4f50924ehkuang  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
81691037db265ecdd914a26e056cf69207b4f50924ehkuang  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
81791037db265ecdd914a26e056cf69207b4f50924ehkuang  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
81891037db265ecdd914a26e056cf69207b4f50924ehkuang  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
81991037db265ecdd914a26e056cf69207b4f50924ehkuang
82091037db265ecdd914a26e056cf69207b4f50924ehkuang  // back to 16-bit and pack 8 integers into __m128i
82191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_packs_epi32(u0, u1);
82291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_packs_epi32(u2, u3);
82391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_packs_epi32(u4, u5);
82491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_packs_epi32(u6, u7);
82591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_packs_epi32(u8, u9);
82691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_packs_epi32(u10, u11);
82791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_packs_epi32(u12, u13);
82891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_packs_epi32(u14, u15);
82991037db265ecdd914a26e056cf69207b4f50924ehkuang
83091037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
83191037db265ecdd914a26e056cf69207b4f50924ehkuang  s0 = _mm_add_epi16(in[0], in[2]);
83291037db265ecdd914a26e056cf69207b4f50924ehkuang  s1 = _mm_add_epi16(in[1], in[3]);
83391037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_sub_epi16(in[0], in[2]);
83491037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_sub_epi16(in[1], in[3]);
83591037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_unpacklo_epi16(in[4], in[5]);
83691037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_unpackhi_epi16(in[4], in[5]);
83791037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_unpacklo_epi16(in[6], in[7]);
83891037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_unpackhi_epi16(in[6], in[7]);
83991037db265ecdd914a26e056cf69207b4f50924ehkuang
84091037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
84191037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
84291037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
84391037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
84491037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
84591037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
84691037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
84791037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
84891037db265ecdd914a26e056cf69207b4f50924ehkuang
84991037db265ecdd914a26e056cf69207b4f50924ehkuang  w0 = _mm_add_epi32(v0, v4);
85091037db265ecdd914a26e056cf69207b4f50924ehkuang  w1 = _mm_add_epi32(v1, v5);
85191037db265ecdd914a26e056cf69207b4f50924ehkuang  w2 = _mm_add_epi32(v2, v6);
85291037db265ecdd914a26e056cf69207b4f50924ehkuang  w3 = _mm_add_epi32(v3, v7);
85391037db265ecdd914a26e056cf69207b4f50924ehkuang  w4 = _mm_sub_epi32(v0, v4);
85491037db265ecdd914a26e056cf69207b4f50924ehkuang  w5 = _mm_sub_epi32(v1, v5);
85591037db265ecdd914a26e056cf69207b4f50924ehkuang  w6 = _mm_sub_epi32(v2, v6);
85691037db265ecdd914a26e056cf69207b4f50924ehkuang  w7 = _mm_sub_epi32(v3, v7);
85791037db265ecdd914a26e056cf69207b4f50924ehkuang
85891037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
85991037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
86091037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
86191037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
86291037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
86391037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
86491037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
86591037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
86691037db265ecdd914a26e056cf69207b4f50924ehkuang
86791037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
86891037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
86991037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
87091037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
87191037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
87291037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
87391037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
87491037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
87591037db265ecdd914a26e056cf69207b4f50924ehkuang
87691037db265ecdd914a26e056cf69207b4f50924ehkuang  // back to 16-bit intergers
87791037db265ecdd914a26e056cf69207b4f50924ehkuang  s4 = _mm_packs_epi32(u0, u1);
87891037db265ecdd914a26e056cf69207b4f50924ehkuang  s5 = _mm_packs_epi32(u2, u3);
87991037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_packs_epi32(u4, u5);
88091037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_packs_epi32(u6, u7);
88191037db265ecdd914a26e056cf69207b4f50924ehkuang
88291037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
88391037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_unpacklo_epi16(s2, s3);
88491037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_unpackhi_epi16(s2, s3);
88591037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_unpacklo_epi16(s6, s7);
88691037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_unpackhi_epi16(s6, s7);
88791037db265ecdd914a26e056cf69207b4f50924ehkuang
88891037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
88991037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
89091037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
89191037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
89291037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
89391037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
89491037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
89591037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
89691037db265ecdd914a26e056cf69207b4f50924ehkuang
89791037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
89891037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
89991037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
90091037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
90191037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
90291037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
90391037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
90491037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
90591037db265ecdd914a26e056cf69207b4f50924ehkuang
90691037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
90791037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
90891037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
90991037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
91091037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
91191037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
91291037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
91391037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
91491037db265ecdd914a26e056cf69207b4f50924ehkuang
91591037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_packs_epi32(v0, v1);
91691037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_packs_epi32(v2, v3);
91791037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_packs_epi32(v4, v5);
91891037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_packs_epi32(v6, v7);
91991037db265ecdd914a26e056cf69207b4f50924ehkuang
92091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = s0;
92191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_sub_epi16(k__const_0, s4);
92291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = s6;
92391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_sub_epi16(k__const_0, s2);
92491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = s3;
92591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_sub_epi16(k__const_0, s7);
92691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = s5;
92791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_sub_epi16(k__const_0, s1);
92891037db265ecdd914a26e056cf69207b4f50924ehkuang}
92991037db265ecdd914a26e056cf69207b4f50924ehkuang
93091037db265ecdd914a26e056cf69207b4f50924ehkuang
// 8x8 inverse hybrid transform followed by reconstruction into dest.
//
//   input   - 64 16-bit coefficients, read as eight rows of eight with
//             _mm_load_si128, so the pointer must be 16-byte aligned.
//   dest    - destination pixels, handed to RECON_AND_STORE once per row.
//   stride  - not used directly in this function body.
//   tx_type - 0: DCT_DCT, 1: ADST_DCT, 2: DCT_ADST, 3: ADST_ADST.
//             Any other value hits assert(0) and skips both passes.
//
// NOTE(review): `zero` and `stride` are not referenced directly below;
// RECON_AND_STORE is expected to pick them up by name (as
// RECON_AND_STORE4X4 does with `zero`) and presumably adds the residual row
// to the pixels at dest and advances dest -- confirm against the macro.
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  __m128i in[8];
  int row;

  // Load input data: one row of eight coefficients per vector.
  for (row = 0; row < 8; ++row) {
    in[row] = _mm_load_si128((const __m128i *)(input + 8 * row));
  }

  if (tx_type < 0 || tx_type > 3) {
    assert(0);
  } else {
    // Bit 1 of tx_type selects the transform of the first pass, bit 0 the
    // transform of the second pass (set = ADST, clear = DCT).
    if (tx_type & 2) {
      iadst8_sse2(in);
    } else {
      idct8_sse2(in);
    }
    if (tx_type & 1) {
      iadst8_sse2(in);
    } else {
      idct8_sse2(in);
    }
  }

  // Final rounding and shift: saturating add of 16, then arithmetic >> 5.
  for (row = 0; row < 8; ++row) {
    in[row] = _mm_srai_epi16(_mm_adds_epi16(in[row], final_rounding), 5);
  }

  // Reconstruct and store, one row per invocation.
  for (row = 0; row < 8; ++row) {
    RECON_AND_STORE(dest, in[row]);
  }
}
99791037db265ecdd914a26e056cf69207b4f50924ehkuang
9985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
999ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
1000ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1001ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<4);
1002ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1003ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1004ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
1005ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
1006ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1007ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1009ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1010ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1011ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1012ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
1013ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
1014ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
1015ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1016ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1017ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Rows. Load 4-row input data.
10185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in0 = _mm_load_si128((const __m128i *)input);
10195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
10205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
10215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
1022ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1023ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 8x4 Transpose
1024b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
1025ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage1
10265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { //NOLINT
1027b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
1028b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
1029ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1030ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
1031ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
1032ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
1033ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
1034ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1035ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
1036ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
1037ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
1038ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
1039ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1040ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1041ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1042ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1043ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1044b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
1045b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
1046ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1047ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1048ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage2
10495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { //NOLINT
1050b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
1051b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
1052ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1053ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
1054ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
1055ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
1056ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
1057ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1058ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
1059ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
1060ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
1061ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
1062ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1063ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1064ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1065ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1066ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1067b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
1068b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
1069ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1070b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
1071b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
1072b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
1073b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_4 = tmp0;
1074b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
1075b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
1076ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1077ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1078ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage3
10795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { //NOLINT
1080ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
1081b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
1082b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
1083b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
1084b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
1085b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
1086b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
1087ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1088ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
1089ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
1090ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1091ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
1092ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
1093ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1094ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1095ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1096b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
1097ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1098ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1099ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage4
1100b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
1101b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
1102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
1103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
1104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1105b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
1106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
1107b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
1108b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian           in0, in1, in2, in3, in4, in5, in6, in7);
1109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final rounding and shift
1110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(in0, final_rounding);
1111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(in1, final_rounding);
1112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(in2, final_rounding);
1113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(in3, final_rounding);
1114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_adds_epi16(in4, final_rounding);
1115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_adds_epi16(in5, final_rounding);
1116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_adds_epi16(in6, final_rounding);
1117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_adds_epi16(in7, final_rounding);
1118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_srai_epi16(in0, 5);
1120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_srai_epi16(in1, 5);
1121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_srai_epi16(in2, 5);
1122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_srai_epi16(in3, 5);
1123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_srai_epi16(in4, 5);
1124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_srai_epi16(in5, 5);
1125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_srai_epi16(in6, 5);
1126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_srai_epi16(in7, 5);
1127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in0);
1129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in1);
1130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in2);
1131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in3);
1132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in4);
1133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in5);
1134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in6);
1135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in7);
1136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
/*
 * IDCT16: stages 2-6 of one 1-D pass of the 16-point inverse DCT (stage
 * numbering as labelled in the body).  All arithmetic is SSE2 on the eight
 * 16-bit lanes of each __m128i.
 *
 * This is an object-like macro: it takes no arguments and expands to five
 * consecutive brace-enclosed blocks that use identifiers from the expansion
 * site by name.  The expanding function must have in scope:
 *   - in[0..15]: __m128i inputs; only read here.
 *   - stg2_0..stg2_7, stg3_0..stg3_3, stg4_0..stg4_7, stg6_0: multiplier
 *     constants handed to _mm_madd_epi16 / MULTIPLICATION_AND_ADD.
 *   - rounding and DCT_CONST_BITS: the bias and shift used by stage 5.
 *   - tmp0..tmp3: scratch, overwritten by stage 5.  MULTIPLICATION_AND_ADD is
 *     defined outside this block and may need more scratch -- check its
 *     definition.
 *   - stp1_0..stp1_15, stp1_8_0, stp1_12_0, stp2_0..stp2_15: work variables.
 *
 * Results: indices 0-7 and 10-13 end up in stp2_* (stage 6); indices 8, 9,
 * 14 and 15 end up in stp1_* (stage 5) and are not touched by stage 6.
 *
 * The plain adds/subtracts are _mm_add_epi16/_mm_sub_epi16, which wrap on
 * 16-bit overflow rather than saturate.
 */
#define IDCT16 \
  /* Stage2: odd-indexed inputs, paired (1,15) (9,7) (5,11) (13,3). */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3: inputs (2,14) and (10,6), plus butterflies on stage-2 output. */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    /* The 8 and 12 sums go to stp1_8_0 / stp1_12_0; stage 5 derives */ \
    /* stp1_8 and stp1_12 from them.                                  */ \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4: inputs (0,8) and (4,12); pairs (9,14) and (10,13) of stage 3. */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    /* (6,5) pair: multiply-add against stg4_1 and stg4_0, add the */ \
    /* rounding bias, shift right by DCT_CONST_BITS, then pack the  */ \
    /* 32-bit results back to 16 bits with signed saturation.       */ \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    /* Order matters: stp1_8 reads stp1_11 before stp1_11 is replaced. */ \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    /* Order matters: stp1_12 reads stp1_15 before stp1_15 is replaced. */ \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    /* stp2_4 and stp2_7 still hold stage-4 values here; each sum is */ \
    /* formed before the matching difference overwrites the operand. */ \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }
1266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
/*
 * IDCT16_10: reduced form of IDCT16 that reads only in[0], in[1], in[2] and
 * in[3]; `zero` is interleaved wherever IDCT16 would use another input.
 * NOTE(review): the "_10" suffix presumably refers to blocks with at most 10
 * non-zero coefficients -- confirm against the callers.
 *
 * Where IDCT16 forms a sum and a difference of two intermediate terms, this
 * variant stores one MULTIPLICATION_AND_ADD / MULTIPLICATION_AND_ADD_2 result
 * and copies it (stages 3-5).  That equals IDCT16's result when the skipped
 * term is zero.
 *
 * Like IDCT16 this is an object-like macro expanding to five consecutive
 * brace-enclosed blocks.  The expansion site must have in scope:
 *   - in[0..3] and zero (__m128i); only read here.
 *   - stg2_0, stg2_1, stg2_6, stg2_7, stg3_0, stg3_1, stg4_0, stg4_1,
 *     stg4_4..stg4_7, stg6_0: multiplier constants.
 *   - rounding and DCT_CONST_BITS: the bias and shift used by stage 5.
 *   - tmp0..tmp3: scratch, overwritten by stage 5.  The two helper macros are
 *     defined outside this block and may need more scratch -- check them.
 *   - stp1_0..stp1_3, stp1_5, stp1_6, stp1_8..stp1_15, stp1_8_0, stp1_12_0,
 *     stp2_0..stp2_7, stp2_9..stp2_14: work variables.
 *
 * Results: indices 0-7 and 10-13 end up in stp2_* (stage 6); indices 8, 9,
 * 14 and 15 end up in stp1_* (stage 5) and are not touched by stage 6.
 */
#define IDCT16_10 \
    /* Stage2: the in[15] and in[13] operand slots are filled with zero, */ \
    /* although the local names still carry the IDCT16 pair labels.      */ \
    { \
      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
      \
      /* Results are written straight into stage-3 variables. */ \
      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
                             stg2_0, stg2_1, stg2_6, stg2_7, \
                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
    } \
    \
    /* Stage3 */ \
    { \
      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
      \
      /* Results are written straight into stage-4 variables. */ \
      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
                               stg3_0, stg3_1, \
                               stp2_4, stp2_7) \
      \
      /* Copies stand in for IDCT16's stage-3 sums and differences. */ \
      stp1_9 = stp1_8_0; \
      stp1_10 = stp1_11; \
      \
      stp1_13 = stp1_12_0; \
      stp1_14 = stp1_15; \
    } \
    \
    /* Stage4 */ \
    { \
      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
      \
      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
      \
      /* Results are written straight into stage-5 variables. */ \
      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
                               stg4_0, stg4_1, \
                               stp1_0, stp1_1) \
      stp2_5 = stp2_4; \
      stp2_6 = stp2_7; \
      \
      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                             stg4_4, stg4_5, stg4_6, stg4_7, \
                             stp2_9, stp2_14, stp2_10, stp2_13) \
    } \
    \
    /* Stage5 */ \
    { \
      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_2 = stp1_1; \
      stp1_3 = stp1_0; \
      \
      /* (6,5) pair: multiply-add against stg4_1 and stg4_0, add the */ \
      /* rounding bias, shift right by DCT_CONST_BITS, then pack the  */ \
      /* 32-bit results back to 16 bits with signed saturation.       */ \
      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
      \
      /* Order matters: stp1_8 reads stp1_11 before stp1_11 is replaced. */ \
      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
      stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
      \
      /* Order matters: stp1_12 reads stp1_15 before stp1_15 is replaced. */ \
      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
      stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
    } \
    \
    /* Stage6 */ \
    { \
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
      \
      /* stp2_4 and stp2_7 still hold stage-3 values here; each sum is */ \
      /* formed before the matching difference overwrites the operand. */ \
      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
      \
      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                             stg6_0, stg4_0, stg6_0, stg4_0, \
                             stp2_10, stp2_13, stp2_11, stp2_12) \
    }
1374b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
13755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
13765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                int stride) {
1377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
1379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
1380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1406b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in[16], l[16], r[16], *curr1;
1407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8_0, stp1_12_0;
1410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
1414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1415b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  curr1 = l;
1416b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  for (i = 0; i < 2; i++) {
1417b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 1-D idct
1418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Load input data.
1420b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[0] = _mm_load_si128((const __m128i *)input);
1421b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
1422b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
1423b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
1424b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
1425b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
1426b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
1427b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
1428b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
1429b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
1430b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
1431b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
1432b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
1433b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
1434b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
1435b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
1436b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
1437b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(in, in);
1438b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(in+8, in+8);
1439b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
1440b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      IDCT16
1441b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
1442b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Stage7
1443b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1444b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1445b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1446b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1447b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1448b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1449b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1450b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1451b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1452b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1453b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1454b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1455b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1456b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1457b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1458b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1459b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
1460b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      curr1 = r;
1461b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      input += 128;
1462b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  }
1463b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  for (i = 0; i < 2; i++) {
1464b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 1-D idct
1465b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(l+i*8, in);
1466b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(r+i*8, in+8);
1467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1468b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      IDCT16
1469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 2-D
1471b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[0] = _mm_add_epi16(stp2_0, stp1_15);
1472b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[1] = _mm_add_epi16(stp2_1, stp1_14);
1473b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[2] = _mm_add_epi16(stp2_2, stp2_13);
1474b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[3] = _mm_add_epi16(stp2_3, stp2_12);
1475b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[4] = _mm_add_epi16(stp2_4, stp2_11);
1476b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[5] = _mm_add_epi16(stp2_5, stp2_10);
1477b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[6] = _mm_add_epi16(stp2_6, stp1_9);
1478b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[7] = _mm_add_epi16(stp2_7, stp1_8);
1479b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1480b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1481b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1482b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1483b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1484b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1485b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1486b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Final rounding and shift
1489b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[0] = _mm_adds_epi16(in[0], final_rounding);
1490b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[1] = _mm_adds_epi16(in[1], final_rounding);
1491b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[2] = _mm_adds_epi16(in[2], final_rounding);
1492b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[3] = _mm_adds_epi16(in[3], final_rounding);
1493b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[4] = _mm_adds_epi16(in[4], final_rounding);
1494b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[5] = _mm_adds_epi16(in[5], final_rounding);
1495b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[6] = _mm_adds_epi16(in[6], final_rounding);
1496b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[7] = _mm_adds_epi16(in[7], final_rounding);
1497b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[8] = _mm_adds_epi16(in[8], final_rounding);
1498b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[9] = _mm_adds_epi16(in[9], final_rounding);
1499b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[10] = _mm_adds_epi16(in[10], final_rounding);
1500b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[11] = _mm_adds_epi16(in[11], final_rounding);
1501b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[12] = _mm_adds_epi16(in[12], final_rounding);
1502b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[13] = _mm_adds_epi16(in[13], final_rounding);
1503b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[14] = _mm_adds_epi16(in[14], final_rounding);
1504b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[15] = _mm_adds_epi16(in[15], final_rounding);
1505b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
1506b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[0] = _mm_srai_epi16(in[0], 6);
1507b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[1] = _mm_srai_epi16(in[1], 6);
1508b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[2] = _mm_srai_epi16(in[2], 6);
1509b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[3] = _mm_srai_epi16(in[3], 6);
1510b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[4] = _mm_srai_epi16(in[4], 6);
1511b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[5] = _mm_srai_epi16(in[5], 6);
1512b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[6] = _mm_srai_epi16(in[6], 6);
1513b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[7] = _mm_srai_epi16(in[7], 6);
1514b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[8] = _mm_srai_epi16(in[8], 6);
1515b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[9] = _mm_srai_epi16(in[9], 6);
1516b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[10] = _mm_srai_epi16(in[10], 6);
1517b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[11] = _mm_srai_epi16(in[11], 6);
1518b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[12] = _mm_srai_epi16(in[12], 6);
1519b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[13] = _mm_srai_epi16(in[13], 6);
1520b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[14] = _mm_srai_epi16(in[14], 6);
1521b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[15] = _mm_srai_epi16(in[15], 6);
1522b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
1523b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[0]);
1524b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[1]);
1525b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[2]);
1526b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[3]);
1527b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[4]);
1528b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[5]);
1529b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[6]);
1530b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[7]);
1531b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[8]);
1532b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[9]);
1533b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[10]);
1534b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[11]);
1535b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[12]);
1536b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[13]);
1537b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[14]);
1538b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[15]);
1539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      dest += 8 - (stride * 16);
1541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
15445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
1545f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  __m128i dc_value;
1546f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  const __m128i zero = _mm_setzero_si128();
1547f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int a, i;
1548f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
1549f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
1550f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(a * cospi_16_64);
1551f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = ROUND_POWER_OF_TWO(a, 6);
1552f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
1553f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  dc_value = _mm_set1_epi16(a);
1554f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
1555f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  for (i = 0; i < 2; ++i) {
1556f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1557f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1558f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1559f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1560f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1561f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1562f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1563f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1564f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1565f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1566f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1567f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1568f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1569f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1570f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1571f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1572f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    dest += 8 - (stride * 16);
1573f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  }
1574f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang}
1575f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
157691037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
157791037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i tbuf[8];
157891037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res0, res0);
157991037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res1, tbuf);
158091037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res0 + 8, res1);
158191037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res1 + 8, res1 + 8);
158291037db265ecdd914a26e056cf69207b4f50924ehkuang
158391037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[8] = tbuf[0];
158491037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[9] = tbuf[1];
158591037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[10] = tbuf[2];
158691037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[11] = tbuf[3];
158791037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[12] = tbuf[4];
158891037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[13] = tbuf[5];
158991037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[14] = tbuf[6];
159091037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[15] = tbuf[7];
159191037db265ecdd914a26e056cf69207b4f50924ehkuang}
159291037db265ecdd914a26e056cf69207b4f50924ehkuang
1593b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst16_8col(__m128i *in) {
159491037db265ecdd914a26e056cf69207b4f50924ehkuang  // perform 16x16 1-D ADST for 8 columns
159591037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i s[16], x[16], u[32], v[32];
159691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
159791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
159891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
159991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
160091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
160191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
160291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
160391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
160491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
160591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
160691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
160791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
160891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
160991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
161091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
161191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
161291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
161391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
161491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
161591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
161691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
161791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
161891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
161991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
162091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
162191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
162291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
162391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
162491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
162591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
162691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i kZero = _mm_set1_epi16(0);
162791037db265ecdd914a26e056cf69207b4f50924ehkuang
162891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
162991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
163091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
163191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
163291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
163391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
163491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
163591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
163691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
163791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
163891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
163991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
164091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
164191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
164291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
164391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
164491037db265ecdd914a26e056cf69207b4f50924ehkuang
164591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
164691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
164791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
164891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
164991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
165091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
165191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
165291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
165391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
165491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
165591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
165691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
165791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
165891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
165991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
166091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
166191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
166291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
166391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
166491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
166591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
166691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
166791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
166891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
166991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
167091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
167191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
167291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
167391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
167491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
167591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
167691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
167791037db265ecdd914a26e056cf69207b4f50924ehkuang
167891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[16]);
167991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[17]);
168091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[18]);
168191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[19]);
168291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], v[20]);
168391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], v[21]);
168491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], v[22]);
168591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], v[23]);
168691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], v[24]);
168791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], v[25]);
168891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], v[26]);
168991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], v[27]);
169091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], v[28]);
169191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], v[29]);
169291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], v[30]);
169391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], v[31]);
169491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[16] = _mm_sub_epi32(v[0], v[16]);
169591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[17] = _mm_sub_epi32(v[1], v[17]);
169691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[18] = _mm_sub_epi32(v[2], v[18]);
169791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[19] = _mm_sub_epi32(v[3], v[19]);
169891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[20] = _mm_sub_epi32(v[4], v[20]);
169991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[21] = _mm_sub_epi32(v[5], v[21]);
170091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[22] = _mm_sub_epi32(v[6], v[22]);
170191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[23] = _mm_sub_epi32(v[7], v[23]);
170291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[24] = _mm_sub_epi32(v[8], v[24]);
170391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[25] = _mm_sub_epi32(v[9], v[25]);
170491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[26] = _mm_sub_epi32(v[10], v[26]);
170591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[27] = _mm_sub_epi32(v[11], v[27]);
170691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[28] = _mm_sub_epi32(v[12], v[28]);
170791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[29] = _mm_sub_epi32(v[13], v[29]);
170891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[30] = _mm_sub_epi32(v[14], v[30]);
170991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[31] = _mm_sub_epi32(v[15], v[31]);
171091037db265ecdd914a26e056cf69207b4f50924ehkuang
171191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
171291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
171391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
171491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
171591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
171691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
171791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
171891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
171991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
172091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
172191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
172291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
172391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
172491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
172591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
172691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
172791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
172891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
172991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
173091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
173191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
173291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
173391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
173491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
173591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
173691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
173791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
173891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
173991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
174091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
174191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
174291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
174391037db265ecdd914a26e056cf69207b4f50924ehkuang
174491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
174591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
174691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
174791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
174891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
174991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
175091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
175191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
175291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
175391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
175491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
175591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
175691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
175791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
175891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
175991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
176091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
176191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
176291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
176391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
176491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
176591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
176691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
176791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
176891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
176991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
177091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
177191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
177291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
177391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
177491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
177591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
177691037db265ecdd914a26e056cf69207b4f50924ehkuang
177791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_packs_epi32(u[0], u[1]);
177891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_packs_epi32(u[2], u[3]);
177991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_packs_epi32(u[4], u[5]);
178091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_packs_epi32(u[6], u[7]);
178191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_packs_epi32(u[8], u[9]);
178291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_packs_epi32(u[10], u[11]);
178391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_packs_epi32(u[12], u[13]);
178491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_packs_epi32(u[14], u[15]);
178591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = _mm_packs_epi32(u[16], u[17]);
178691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = _mm_packs_epi32(u[18], u[19]);
178791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[20], u[21]);
178891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[22], u[23]);
178991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[24], u[25]);
179091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[26], u[27]);
179191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[28], u[29]);
179291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(u[30], u[31]);
179391037db265ecdd914a26e056cf69207b4f50924ehkuang
179491037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
179591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
179691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
179791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
179891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
179991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
180091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
180191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
180291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
180391037db265ecdd914a26e056cf69207b4f50924ehkuang
180491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
180591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
180691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
180791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
180891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
180991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
181091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
181191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
181291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
181391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
181491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
181591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
181691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
181791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
181891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
181991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
182091037db265ecdd914a26e056cf69207b4f50924ehkuang
182191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[8]);
182291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[9]);
182391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[10]);
182491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[11]);
182591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], v[12]);
182691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], v[13]);
182791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], v[14]);
182891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], v[15]);
182991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_sub_epi32(v[0], v[8]);
183091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_sub_epi32(v[1], v[9]);
183191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_sub_epi32(v[2], v[10]);
183291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_sub_epi32(v[3], v[11]);
183391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_sub_epi32(v[4], v[12]);
183491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_sub_epi32(v[5], v[13]);
183591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_sub_epi32(v[6], v[14]);
183691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_sub_epi32(v[7], v[15]);
183791037db265ecdd914a26e056cf69207b4f50924ehkuang
183891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
183991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
184091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
184191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
184291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
184391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
184491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
184591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
184691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
184791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
184891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
184991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
185091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
185191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
185291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
185391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
185491037db265ecdd914a26e056cf69207b4f50924ehkuang
185591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
185691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
185791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
185891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
185991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
186091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
186191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
186291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
186391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
186491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
186591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
186691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
186791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
186891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
186991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
187091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
187191037db265ecdd914a26e056cf69207b4f50924ehkuang
187291037db265ecdd914a26e056cf69207b4f50924ehkuang  x[0] = _mm_add_epi16(s[0], s[4]);
187391037db265ecdd914a26e056cf69207b4f50924ehkuang  x[1] = _mm_add_epi16(s[1], s[5]);
187491037db265ecdd914a26e056cf69207b4f50924ehkuang  x[2] = _mm_add_epi16(s[2], s[6]);
187591037db265ecdd914a26e056cf69207b4f50924ehkuang  x[3] = _mm_add_epi16(s[3], s[7]);
187691037db265ecdd914a26e056cf69207b4f50924ehkuang  x[4] = _mm_sub_epi16(s[0], s[4]);
187791037db265ecdd914a26e056cf69207b4f50924ehkuang  x[5] = _mm_sub_epi16(s[1], s[5]);
187891037db265ecdd914a26e056cf69207b4f50924ehkuang  x[6] = _mm_sub_epi16(s[2], s[6]);
187991037db265ecdd914a26e056cf69207b4f50924ehkuang  x[7] = _mm_sub_epi16(s[3], s[7]);
188091037db265ecdd914a26e056cf69207b4f50924ehkuang  x[8] = _mm_packs_epi32(u[0], u[1]);
188191037db265ecdd914a26e056cf69207b4f50924ehkuang  x[9] = _mm_packs_epi32(u[2], u[3]);
188291037db265ecdd914a26e056cf69207b4f50924ehkuang  x[10] = _mm_packs_epi32(u[4], u[5]);
188391037db265ecdd914a26e056cf69207b4f50924ehkuang  x[11] = _mm_packs_epi32(u[6], u[7]);
188491037db265ecdd914a26e056cf69207b4f50924ehkuang  x[12] = _mm_packs_epi32(u[8], u[9]);
188591037db265ecdd914a26e056cf69207b4f50924ehkuang  x[13] = _mm_packs_epi32(u[10], u[11]);
188691037db265ecdd914a26e056cf69207b4f50924ehkuang  x[14] = _mm_packs_epi32(u[12], u[13]);
188791037db265ecdd914a26e056cf69207b4f50924ehkuang  x[15] = _mm_packs_epi32(u[14], u[15]);
188891037db265ecdd914a26e056cf69207b4f50924ehkuang
188991037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
189091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
189191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
189291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
189391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
189491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
189591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
189691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
189791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
189891037db265ecdd914a26e056cf69207b4f50924ehkuang
189991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
190091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
190191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
190291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
190391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
190491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
190591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
190691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
190791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
190891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
190991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
191091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
191191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
191291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
191391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
191491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
191591037db265ecdd914a26e056cf69207b4f50924ehkuang
191691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[4]);
191791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[5]);
191891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[6]);
191991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[7]);
192091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_sub_epi32(v[0], v[4]);
192191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_sub_epi32(v[1], v[5]);
192291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_sub_epi32(v[2], v[6]);
192391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_sub_epi32(v[3], v[7]);
192491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], v[12]);
192591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], v[13]);
192691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], v[14]);
192791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], v[15]);
192891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_sub_epi32(v[8], v[12]);
192991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_sub_epi32(v[9], v[13]);
193091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_sub_epi32(v[10], v[14]);
193191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_sub_epi32(v[11], v[15]);
193291037db265ecdd914a26e056cf69207b4f50924ehkuang
193391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
193491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
193591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
193691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
193791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
193891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
193991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
194091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
194191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
194291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
194391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
194491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
194591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
194691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
194791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
194891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
194991037db265ecdd914a26e056cf69207b4f50924ehkuang
195091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
195191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
195291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
195391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
195491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
195591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
195691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
195791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
195891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
195991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
196091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
196191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
196291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
196391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
196491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
196591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
196691037db265ecdd914a26e056cf69207b4f50924ehkuang
196791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_add_epi16(x[0], x[2]);
196891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_add_epi16(x[1], x[3]);
196991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_sub_epi16(x[0], x[2]);
197091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_sub_epi16(x[1], x[3]);
197191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_packs_epi32(v[0], v[1]);
197291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_packs_epi32(v[2], v[3]);
197391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_packs_epi32(v[4], v[5]);
197491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_packs_epi32(v[6], v[7]);
197591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = _mm_add_epi16(x[8], x[10]);
197691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = _mm_add_epi16(x[9], x[11]);
197791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_sub_epi16(x[8], x[10]);
197891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_sub_epi16(x[9], x[11]);
197991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(v[8], v[9]);
198091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(v[10], v[11]);
198191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(v[12], v[13]);
198291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(v[14], v[15]);
198391037db265ecdd914a26e056cf69207b4f50924ehkuang
198491037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 4
198591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
198691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
198791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
198891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
198991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
199091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
199191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
199291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
199391037db265ecdd914a26e056cf69207b4f50924ehkuang
199491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
199591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
199691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
199791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
199891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
199991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
200091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
200191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
200291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
200391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
200491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
200591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
200691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
200791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
200891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
200991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
201091037db265ecdd914a26e056cf69207b4f50924ehkuang
201191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
201291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
201391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
201491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
201591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
201691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
201791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
201891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
201991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
202091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
202191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
202291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
202391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
202491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
202591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
202691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
202791037db265ecdd914a26e056cf69207b4f50924ehkuang
202891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
202991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
203091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
203191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
203291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
203391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
203491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
203591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
203691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
203791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
203891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
203991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
204091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
204191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
204291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
204391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
204491037db265ecdd914a26e056cf69207b4f50924ehkuang
204591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = s[0];
204691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_sub_epi16(kZero, s[8]);
204791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = s[12];
204891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_sub_epi16(kZero, s[4]);
204991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_packs_epi32(v[4], v[5]);
205091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_packs_epi32(v[12], v[13]);
205191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_packs_epi32(v[8], v[9]);
205291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_packs_epi32(v[0], v[1]);
205391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_packs_epi32(v[2], v[3]);
205491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_packs_epi32(v[10], v[11]);
205591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_packs_epi32(v[14], v[15]);
205691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_packs_epi32(v[6], v[7]);
205791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = s[5];
205891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_sub_epi16(kZero, s[13]);
205991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = s[9];
206091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_sub_epi16(kZero, s[1]);
206191037db265ecdd914a26e056cf69207b4f50924ehkuang}
206291037db265ecdd914a26e056cf69207b4f50924ehkuang
2063b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct16_8col(__m128i *in) {
206491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
206591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
206691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
206791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
206891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
206991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
207091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
207191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
207291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
207391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
207491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
207591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
207691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
207791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
207891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
207991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
208091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
208191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
208291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
208391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
208491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
208591037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i v[16], u[16], s[16], t[16];
208691037db265ecdd914a26e056cf69207b4f50924ehkuang
208791037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
208891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = in[0];
208991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = in[8];
209091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = in[4];
209191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = in[12];
209291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = in[2];
209391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = in[10];
209491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = in[6];
209591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = in[14];
209691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = in[1];
209791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = in[9];
209891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = in[5];
209991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = in[13];
210091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = in[3];
210191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = in[11];
210291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = in[7];
210391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = in[15];
210491037db265ecdd914a26e056cf69207b4f50924ehkuang
210591037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
210691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
210791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
210891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
210991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
211091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
211191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
211291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
211391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
211491037db265ecdd914a26e056cf69207b4f50924ehkuang
211591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
211691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
211791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
211891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
211991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
212091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
212191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
212291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
212391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
212491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
212591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
212691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
212791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
212891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
212991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
213091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
213191037db265ecdd914a26e056cf69207b4f50924ehkuang
213291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
213391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
213491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
213591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
213691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
213791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
213891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
213991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
214091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
214191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
214291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
214391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
214491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
214591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
214691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
214791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
214891037db265ecdd914a26e056cf69207b4f50924ehkuang
214991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
215091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
215191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
215291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
215391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
215491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
215591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
215691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
215791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
215891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
215991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
216091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
216191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
216291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
216391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
216491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
216591037db265ecdd914a26e056cf69207b4f50924ehkuang
216691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8]  = _mm_packs_epi32(u[0], u[1]);
216791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(u[2], u[3]);
216891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9]  = _mm_packs_epi32(u[4], u[5]);
216991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[6], u[7]);
217091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[8], u[9]);
217191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[10], u[11]);
217291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[12], u[13]);
217391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[14], u[15]);
217491037db265ecdd914a26e056cf69207b4f50924ehkuang
217591037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
217691037db265ecdd914a26e056cf69207b4f50924ehkuang  t[0] = s[0];
217791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[1] = s[1];
217891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[2] = s[2];
217991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[3] = s[3];
218091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
218191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
218291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
218391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
218491037db265ecdd914a26e056cf69207b4f50924ehkuang
218591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
218691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
218791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
218891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
218991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
219091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
219191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
219291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
219391037db265ecdd914a26e056cf69207b4f50924ehkuang
219491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
219591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
219691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
219791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
219891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
219991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
220091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
220191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
220291037db265ecdd914a26e056cf69207b4f50924ehkuang
220391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
220491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
220591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
220691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
220791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
220891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
220991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
221091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
221191037db265ecdd914a26e056cf69207b4f50924ehkuang
221291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[4] = _mm_packs_epi32(u[0], u[1]);
221391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[7] = _mm_packs_epi32(u[2], u[3]);
221491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[5] = _mm_packs_epi32(u[4], u[5]);
221591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[6] = _mm_packs_epi32(u[6], u[7]);
221691037db265ecdd914a26e056cf69207b4f50924ehkuang  t[8] = _mm_add_epi16(s[8], s[9]);
221791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[9] = _mm_sub_epi16(s[8], s[9]);
221891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[10] = _mm_sub_epi16(s[11], s[10]);
221991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[11] = _mm_add_epi16(s[10], s[11]);
222091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[12] = _mm_add_epi16(s[12], s[13]);
222191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[13] = _mm_sub_epi16(s[12], s[13]);
222291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[14] = _mm_sub_epi16(s[15], s[14]);
222391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[15] = _mm_add_epi16(s[14], s[15]);
222491037db265ecdd914a26e056cf69207b4f50924ehkuang
222591037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 4
222691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
222791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
222891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
222991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
223091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
223191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
223291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
223391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
223491037db265ecdd914a26e056cf69207b4f50924ehkuang
223591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
223691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
223791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
223891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
223991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
224091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
224191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
224291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
224391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
224491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
224591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
224691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
224791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
224891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
224991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
225091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
225191037db265ecdd914a26e056cf69207b4f50924ehkuang
225291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
225391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
225491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
225591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
225691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
225791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
225891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
225991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
226091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
226191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
226291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
226391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
226491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
226591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
226691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
226791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
226891037db265ecdd914a26e056cf69207b4f50924ehkuang
226991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
227091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
227191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
227291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
227391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
227491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
227591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
227691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
227791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
227891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
227991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
228091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
228191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
228291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
228391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
228491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
228591037db265ecdd914a26e056cf69207b4f50924ehkuang
228691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_packs_epi32(u[0], u[1]);
228791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_packs_epi32(u[2], u[3]);
228891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_packs_epi32(u[4], u[5]);
228991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_packs_epi32(u[6], u[7]);
229091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_add_epi16(t[4], t[5]);
229191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_sub_epi16(t[4], t[5]);
229291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_sub_epi16(t[7], t[6]);
229391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_add_epi16(t[6], t[7]);
229491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = t[8];
229591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = t[15];
229691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9]  = _mm_packs_epi32(u[8], u[9]);
229791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[10], u[11]);
229891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[12], u[13]);
229991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[14], u[15]);
230091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = t[11];
230191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = t[12];
230291037db265ecdd914a26e056cf69207b4f50924ehkuang
230391037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 5
230491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[0] = _mm_add_epi16(s[0], s[3]);
230591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[1] = _mm_add_epi16(s[1], s[2]);
230691037db265ecdd914a26e056cf69207b4f50924ehkuang  t[2] = _mm_sub_epi16(s[1], s[2]);
230791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[3] = _mm_sub_epi16(s[0], s[3]);
230891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[4] = s[4];
230991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[7] = s[7];
231091037db265ecdd914a26e056cf69207b4f50924ehkuang
231191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
231291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
231391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
231491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
231591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
231691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
231791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
231891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
231991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
232091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
232191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
232291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
232391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
232491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
232591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[5] = _mm_packs_epi32(u[0], u[1]);
232691037db265ecdd914a26e056cf69207b4f50924ehkuang  t[6] = _mm_packs_epi32(u[2], u[3]);
232791037db265ecdd914a26e056cf69207b4f50924ehkuang
232891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[8] = _mm_add_epi16(s[8], s[11]);
232991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[9] = _mm_add_epi16(s[9], s[10]);
233091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[10] = _mm_sub_epi16(s[9], s[10]);
233191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[11] = _mm_sub_epi16(s[8], s[11]);
233291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[12] = _mm_sub_epi16(s[15], s[12]);
233391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[13] = _mm_sub_epi16(s[14], s[13]);
233491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[14] = _mm_add_epi16(s[13], s[14]);
233591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[15] = _mm_add_epi16(s[12], s[15]);
233691037db265ecdd914a26e056cf69207b4f50924ehkuang
233791037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 6
233891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_add_epi16(t[0], t[7]);
233991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_add_epi16(t[1], t[6]);
234091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_add_epi16(t[2], t[5]);
234191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_add_epi16(t[3], t[4]);
234291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_sub_epi16(t[3], t[4]);
234391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_sub_epi16(t[2], t[5]);
234491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_sub_epi16(t[1], t[6]);
234591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_sub_epi16(t[0], t[7]);
234691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = t[8];
234791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = t[9];
234891037db265ecdd914a26e056cf69207b4f50924ehkuang
234991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
235091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
235191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
235291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
235391037db265ecdd914a26e056cf69207b4f50924ehkuang
235491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
235591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
235691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
235791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
235891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
235991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
236091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
236191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
236291037db265ecdd914a26e056cf69207b4f50924ehkuang
236391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
236491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
236591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
236691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
236791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
236891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
236991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
237091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
237191037db265ecdd914a26e056cf69207b4f50924ehkuang
237291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
237391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
237491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
237591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
237691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
237791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
237891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
237991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
238091037db265ecdd914a26e056cf69207b4f50924ehkuang
238191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[0], u[1]);
238291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[2], u[3]);
238391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[4], u[5]);
238491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[6], u[7]);
238591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = t[14];
238691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = t[15];
238791037db265ecdd914a26e056cf69207b4f50924ehkuang
238891037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 7
238991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_add_epi16(s[0], s[15]);
239091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_add_epi16(s[1], s[14]);
239191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_add_epi16(s[2], s[13]);
239291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_add_epi16(s[3], s[12]);
239391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_add_epi16(s[4], s[11]);
239491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_add_epi16(s[5], s[10]);
239591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_add_epi16(s[6], s[9]);
239691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_add_epi16(s[7], s[8]);
239791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_sub_epi16(s[7], s[8]);
239891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_sub_epi16(s[6], s[9]);
239991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_sub_epi16(s[5], s[10]);
240091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_sub_epi16(s[4], s[11]);
240191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = _mm_sub_epi16(s[3], s[12]);
240291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_sub_epi16(s[2], s[13]);
240391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = _mm_sub_epi16(s[1], s[14]);
240491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_sub_epi16(s[0], s[15]);
240591037db265ecdd914a26e056cf69207b4f50924ehkuang}
240691037db265ecdd914a26e056cf69207b4f50924ehkuang
// One pass of the 16x16 inverse DCT over a block split across two arrays of
// sixteen vectors each. Both arrays are modified in place.
static void idct16_sse2(__m128i *in0, __m128i *in1) {
  // Rearrange the block first (presumably a 16x16 transpose spanning both
  // halves -- see array_transpose_16x16).
  array_transpose_16x16(in0, in1);

  // Then transform each 8-column half independently.
  idct16_8col(in0);
  idct16_8col(in1);
}
241291037db265ecdd914a26e056cf69207b4f50924ehkuang
// One pass of the 16x16 inverse ADST over a block split across two arrays of
// sixteen vectors each. Both arrays are modified in place.
static void iadst16_sse2(__m128i *in0, __m128i *in1) {
  // Rearrange the block first (presumably a 16x16 transpose spanning both
  // halves -- see array_transpose_16x16).
  array_transpose_16x16(in0, in1);

  // Then transform each 8-column half independently.
  iadst16_8col(in0);
  iadst16_8col(in1);
}
241891037db265ecdd914a26e056cf69207b4f50924ehkuang
24195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
24205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
24215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
24225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
24235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
24245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
24255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
24265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
24275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
24285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
24295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
24305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
24315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
24325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
24335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
24345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
24355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
24365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
243791037db265ecdd914a26e056cf69207b4f50924ehkuang}
243891037db265ecdd914a26e056cf69207b4f50924ehkuang
243991037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
244091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
244191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i zero = _mm_setzero_si128();
244291037db265ecdd914a26e056cf69207b4f50924ehkuang  // Final rounding and shift
244391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_adds_epi16(in[0], final_rounding);
244491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_adds_epi16(in[1], final_rounding);
244591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_adds_epi16(in[2], final_rounding);
244691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_adds_epi16(in[3], final_rounding);
244791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_adds_epi16(in[4], final_rounding);
244891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_adds_epi16(in[5], final_rounding);
244991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_adds_epi16(in[6], final_rounding);
245091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_adds_epi16(in[7], final_rounding);
245191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_adds_epi16(in[8], final_rounding);
245291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_adds_epi16(in[9], final_rounding);
245391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_adds_epi16(in[10], final_rounding);
245491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_adds_epi16(in[11], final_rounding);
245591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = _mm_adds_epi16(in[12], final_rounding);
245691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_adds_epi16(in[13], final_rounding);
245791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = _mm_adds_epi16(in[14], final_rounding);
245891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_adds_epi16(in[15], final_rounding);
245991037db265ecdd914a26e056cf69207b4f50924ehkuang
246091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_srai_epi16(in[0], 6);
246191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_srai_epi16(in[1], 6);
246291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_srai_epi16(in[2], 6);
246391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_srai_epi16(in[3], 6);
246491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_srai_epi16(in[4], 6);
246591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_srai_epi16(in[5], 6);
246691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_srai_epi16(in[6], 6);
246791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_srai_epi16(in[7], 6);
246891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_srai_epi16(in[8], 6);
246991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_srai_epi16(in[9], 6);
247091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_srai_epi16(in[10], 6);
247191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_srai_epi16(in[11], 6);
247291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = _mm_srai_epi16(in[12], 6);
247391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_srai_epi16(in[13], 6);
247491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = _mm_srai_epi16(in[14], 6);
247591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_srai_epi16(in[15], 6);
247691037db265ecdd914a26e056cf69207b4f50924ehkuang
247791037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[0]);
247891037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[1]);
247991037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[2]);
248091037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[3]);
248191037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[4]);
248291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[5]);
248391037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[6]);
248491037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[7]);
248591037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[8]);
248691037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[9]);
248791037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[10]);
248891037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[11]);
248991037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[12]);
249091037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[13]);
249191037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[14]);
249291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[15]);
249391037db265ecdd914a26e056cf69207b4f50924ehkuang}
249491037db265ecdd914a26e056cf69207b4f50924ehkuang
// 16x16 inverse hybrid transform followed by write-out through
// write_buffer_8x16().
//   input   - 16 rows of 16 int16_t coefficients (row r at input + r * 16);
//             loaded with aligned loads as a left and a right 8-column half.
//   dest    - destination bytes; the left half is written starting at dest,
//             the right half starting at dest + 8.
//   stride  - forwarded to write_buffer_8x16().
//   tx_type - selects the 1-D transform used by each of the two passes:
//               0 (DCT_DCT):   idct16  then idct16
//               1 (ADST_DCT):  idct16  then iadst16
//               2 (DCT_ADST):  iadst16 then idct16
//               3 (ADST_ADST): iadst16 then iadst16
//             Any other value hits assert(0); when assertions are compiled
//             out, no transform is applied and the loaded values are still
//             passed to write_buffer_8x16().
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in0[16], in1[16];

  // Left 8 columns go to in0, right 8 columns to in1.
  load_buffer_8x16(input, in0);
  load_buffer_8x16(input + 8, in1);

  if (tx_type < 0 || tx_type > 3) {
    assert(0);
  } else {
    // Bit 1 of tx_type picks the first pass, bit 0 picks the second pass.
    if (tx_type & 2) {
      iadst16_sse2(in0, in1);
    } else {
      idct16_sse2(in0, in1);
    }

    if (tx_type & 1) {
      iadst16_sse2(in0, in1);
    } else {
      idct16_sse2(in0, in1);
    }
  }

  write_buffer_8x16(dest, in0, stride);
  write_buffer_8x16(dest + 8, in1, stride);
}
252991037db265ecdd914a26e056cf69207b4f50924ehkuang
25305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
25315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               int stride) {
2532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
2534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
2535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2552b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in[16], l[16];
2553b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
2554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8_0, stp1_12_0;
2556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2557b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
2558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
2560b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // First 1-D inverse DCT
2561b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Load input data.
2562b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[0] = _mm_load_si128((const __m128i *)input);
2563b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
2564b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
2565b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
2566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2567b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
2568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage2
2570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2571b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
2572b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i lo_13_3 =  _mm_unpackhi_epi16(zero, in[1]);
2573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_add_epi32(tmp5, rounding);
2582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_add_epi32(tmp7, rounding);
2583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2589b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
2590b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
2591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage3
2594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2595b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
2596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2605b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2606b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2608b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2609ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2610ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2611ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage4
2612ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2613b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2614b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2615b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2616ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2617ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2618ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2619ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2620ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2621ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2622ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2623ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2624ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2625ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2626ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding);
2627ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding);
2628ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_add_epi32(tmp5, rounding);
2629ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_add_epi32(tmp7, rounding);
2630ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2631ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2632ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2633ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2634ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2635ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2636ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2637ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2638b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2639b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2640b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2641b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2642b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
2643b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2644ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2645ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2646ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage5 and Stage6
2647ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2648b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2649b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2650b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2651b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2652b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
2653b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
2654b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2655b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
2656b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2657b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
2658b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2659b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2660b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2661b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2662ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2663ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2664ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage6
2665ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2666b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2667ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2674ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2677ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding);
2678ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding);
2679ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2680ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2681ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2682ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2690ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2691b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2692b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
2693ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_10 = _mm_packs_epi32(tmp0, zero);
2694ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_13 = _mm_packs_epi32(tmp2, zero);
2695ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_11 = _mm_packs_epi32(tmp4, zero);
2696ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_12 = _mm_packs_epi32(tmp6, zero);
2697ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2698b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2699b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2700b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2701b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2702b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
2703b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2704b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2705b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2706b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2707b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2708b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2709b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2710b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2711ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2712ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2713ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage7. Left 8x16 only.
2714b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[0] = _mm_add_epi16(stp2_0, stp1_15);
2715b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[1] = _mm_add_epi16(stp2_1, stp1_14);
2716b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[2] = _mm_add_epi16(stp2_2, stp2_13);
2717b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[3] = _mm_add_epi16(stp2_3, stp2_12);
2718b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[4] = _mm_add_epi16(stp2_4, stp2_11);
2719b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[5] = _mm_add_epi16(stp2_5, stp2_10);
2720b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[6] = _mm_add_epi16(stp2_6, stp1_9);
2721b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[7] = _mm_add_epi16(stp2_7, stp1_8);
2722b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2723b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2724b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2725b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2726b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2727b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2728b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2729b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2730b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
2731b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Second 1-D inverse transform, performed per 8x16 block
2732ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 2; i++) {
2733b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    array_transpose_4X8(l + 8*i, in);
2734ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2735b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    IDCT16_10
2736ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2737ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage7
2738b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[0] = _mm_add_epi16(stp2_0, stp1_15);
2739b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[1] = _mm_add_epi16(stp2_1, stp1_14);
2740b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[2] = _mm_add_epi16(stp2_2, stp2_13);
2741b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[3] = _mm_add_epi16(stp2_3, stp2_12);
2742b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[4] = _mm_add_epi16(stp2_4, stp2_11);
2743b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[5] = _mm_add_epi16(stp2_5, stp2_10);
2744b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[6] = _mm_add_epi16(stp2_6, stp1_9);
2745b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[7] = _mm_add_epi16(stp2_7, stp1_8);
2746b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2747b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2748b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2749b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2750b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2751b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2752b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2753b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Final rounding and shift
2756b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[0] = _mm_adds_epi16(in[0], final_rounding);
2757b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[1] = _mm_adds_epi16(in[1], final_rounding);
2758b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[2] = _mm_adds_epi16(in[2], final_rounding);
2759b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[3] = _mm_adds_epi16(in[3], final_rounding);
2760b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[4] = _mm_adds_epi16(in[4], final_rounding);
2761b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[5] = _mm_adds_epi16(in[5], final_rounding);
2762b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[6] = _mm_adds_epi16(in[6], final_rounding);
2763b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[7] = _mm_adds_epi16(in[7], final_rounding);
2764b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[8] = _mm_adds_epi16(in[8], final_rounding);
2765b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[9] = _mm_adds_epi16(in[9], final_rounding);
2766b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[10] = _mm_adds_epi16(in[10], final_rounding);
2767b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[11] = _mm_adds_epi16(in[11], final_rounding);
2768b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[12] = _mm_adds_epi16(in[12], final_rounding);
2769b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[13] = _mm_adds_epi16(in[13], final_rounding);
2770b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[14] = _mm_adds_epi16(in[14], final_rounding);
2771b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[15] = _mm_adds_epi16(in[15], final_rounding);
2772b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
2773b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[0] = _mm_srai_epi16(in[0], 6);
2774b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[1] = _mm_srai_epi16(in[1], 6);
2775b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[2] = _mm_srai_epi16(in[2], 6);
2776b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[3] = _mm_srai_epi16(in[3], 6);
2777b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[4] = _mm_srai_epi16(in[4], 6);
2778b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[5] = _mm_srai_epi16(in[5], 6);
2779b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[6] = _mm_srai_epi16(in[6], 6);
2780b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[7] = _mm_srai_epi16(in[7], 6);
2781b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[8] = _mm_srai_epi16(in[8], 6);
2782b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[9] = _mm_srai_epi16(in[9], 6);
2783b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[10] = _mm_srai_epi16(in[10], 6);
2784b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[11] = _mm_srai_epi16(in[11], 6);
2785b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[12] = _mm_srai_epi16(in[12], 6);
2786b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[13] = _mm_srai_epi16(in[13], 6);
2787b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[14] = _mm_srai_epi16(in[14], 6);
2788b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in[15] = _mm_srai_epi16(in[15], 6);
2789b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
2790b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[0]);
2791b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[1]);
2792b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[2]);
2793b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[3]);
2794b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[4]);
2795b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[5]);
2796b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[6]);
2797b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[7]);
2798b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[8]);
2799b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[9]);
2800b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[10]);
2801b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[11]);
2802b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[12]);
2803b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[13]);
2804b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[14]);
2805b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    RECON_AND_STORE(dest, in[15]);
2806ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2807ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest += 8 - (stride * 16);
2808ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2809ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
2810ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
/*
 * LOAD_DQCOEFF(reg, input)
 *
 * Loads one 128-bit vector from the address held in `input` into `reg`,
 * then advances `input` by 8 elements so that consecutive invocations walk
 * through a coefficient buffer one vector at a time.
 *
 *   reg   - modifiable __m128i lvalue that receives the loaded vector.
 *   input - modifiable pointer lvalue; it is evaluated twice (once for the
 *           load, once for the increment), so it must not have side effects.
 *           Presumably points to 16-bit coefficients, which makes the
 *           8-element step exactly one vector -- confirm against callers.
 *
 * _mm_load_si128 is the aligned load, so the address in `input` must be
 * 16-byte aligned on every invocation.
 *
 * The expansion stays a plain brace block (not do { } while (0)) so that
 * call sites written with or without a trailing semicolon keep compiling.
 * The macro deliberately ends without a line continuation after the closing
 * brace, so the line that follows the definition is never spliced into it.
 */
#define LOAD_DQCOEFF(reg, input) \
  { \
    (reg) = _mm_load_si128((const __m128i *)(input)); \
    (input) += 8; \
  }

2817b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define IDCT32_34 \
2818b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage1 */ \
2819b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \
2820b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();\
2821b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2822b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2823b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2824b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
2825b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2826b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2827b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2828b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2829b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2830b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2831b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2832b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2833b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2834b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg1_1, stp1_16, stp1_31); \
2835b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2836b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg1_7, stp1_19, stp1_28); \
2837b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2838b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg1_9, stp1_20, stp1_27); \
2839b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2840b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg1_15, stp1_23, stp1_24); \
2841b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \
2842b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\
2843b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage2 */ \
2844b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \
2845b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();\
2846b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2847b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2848b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2849b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2850b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2851b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2852b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2853b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg2_1, stp2_8, stp2_15); \
2854b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2855b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg2_7, stp2_11, stp2_12); \
2856b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2857b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_16 = stp1_16; \
2858b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_19 = stp1_19; \
2859b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2860b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_20 = stp1_20; \
2861b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_23 = stp1_23; \
2862b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2863b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_24 = stp1_24; \
2864b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_27 = stp1_27; \
2865b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2866b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_28 = stp1_28; \
2867b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_31 = stp1_31; \
2868b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \
2869b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\
2870b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage3 */ \
2871b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \
2872b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();\
2873b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2874b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2875b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2876b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2877b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2878b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2879b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2880b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2881b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2882b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2883b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2884b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
2885b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2886b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2887b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg3_1, stp1_4, stp1_7); \
2888b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2889b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_8 = stp2_8; \
2890b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_11 = stp2_11; \
2891b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_12 = stp2_12; \
2892b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_15 = stp2_15; \
2893b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2894b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2895b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2896b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stp1_18, stp1_29) \
2897b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2898b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2899b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stp1_22, stp1_25) \
2900b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2901b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_16 = stp2_16; \
2902b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_31 = stp2_31; \
2903b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_19 = stp2_19; \
2904b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_20 = stp2_20; \
2905b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_23 = stp2_23; \
2906b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_24 = stp2_24; \
2907b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_27 = stp2_27; \
2908b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_28 = stp2_28; \
2909b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \
2910b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\
2911b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage4 */ \
2912b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \
2913b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();\
2914b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2915b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2916b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2917b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2918b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2919b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2920b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2921b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2922b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2923b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg4_1, stp2_0, stp2_1); \
2924b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2925b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_4 = stp1_4; \
2926b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_5 = stp1_4; \
2927b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_6 = stp1_7; \
2928b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_7 = stp1_7; \
2929b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2930b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2931b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2932b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stp2_10, stp2_13) \
2933b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2934b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_8 = stp1_8; \
2935b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_15 = stp1_15; \
2936b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_11 = stp1_11; \
2937b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_12 = stp1_12; \
2938b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2939b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2940b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2941b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2942b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2943b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2944b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2945b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2946b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2947b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2948b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2949b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2950b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2951b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2952b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2953b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2954b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2955b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2956b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \
2957b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\
2958b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage5 */ \
2959b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \
2960b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2961b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2962b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2963b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2964b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2965b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2966b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2967b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2968b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2969b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2970b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2971b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2972b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2973b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_0 = stp2_0; \
2974b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_1 = stp2_1; \
2975b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_2 = stp2_1; \
2976b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_3 = stp2_0; \
2977b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2978b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2979b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2980b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2981b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2982b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2983b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp0 = _mm_add_epi32(tmp0, rounding); \
2984b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp1 = _mm_add_epi32(tmp1, rounding); \
2985b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp2 = _mm_add_epi32(tmp2, rounding); \
2986b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp3 = _mm_add_epi32(tmp3, rounding); \
2987b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2988b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2989b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2990b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2991b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2992b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2993b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2994b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2995b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2996b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_4 = stp2_4; \
2997b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_7 = stp2_7; \
2998b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
2999b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
3000b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
3001b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
3002b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
3003b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
3004b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
3005b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
3006b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
3007b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3008b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_16 = stp2_16; \
3009b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_17 = stp2_17; \
3010b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3011b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
3012b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
3013b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stp1_19, stp1_28) \
3014b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
3015b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
3016b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stp1_21, stp1_26) \
3017b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3018b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_22 = stp2_22; \
3019b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_23 = stp2_23; \
3020b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_24 = stp2_24; \
3021b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_25 = stp2_25; \
3022b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_30 = stp2_30; \
3023b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_31 = stp2_31; \
3024b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \
3025b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\
3026b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage6 */ \
3027b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \
3028b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3029b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3030b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
3031b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
3032b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3033b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
3034b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
3035b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
3036b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
3037b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
3038b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
3039b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
3040b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
3041b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3042b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_8 = stp1_8; \
3043b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_9 = stp1_9; \
3044b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_14 = stp1_14; \
3045b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_15 = stp1_15; \
3046b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3047b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
3048b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
3049b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stp2_13, stp2_11, stp2_12) \
3050b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3051b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
3052b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
3053b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
3054b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
3055b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
3056b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
3057b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
3058b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
3059b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3060b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
3061b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
3062b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
3063b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
3064b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
3065b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
3066b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
3067b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
3068b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \
3069b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\
3070b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage7 */ \
3071b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \
3072b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3073b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3074b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3075b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3076b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3077b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3078b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3079b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3080b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3081b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3082b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3083b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3084b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3085b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3086b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3087b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3088b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3089b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3090b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3091b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3092b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3093b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3094b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3095b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3096b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3097b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3098b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3099b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_16 = stp2_16; \
3100b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_17 = stp2_17; \
3101b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_18 = stp2_18; \
3102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_19 = stp2_19; \
3103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3104b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3105b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stp1_21, stp1_26) \
3107b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3108b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3109b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                         stp1_23, stp1_24) \
3110b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3111b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_28 = stp2_28; \
3112b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_29 = stp2_29; \
3113b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_30 = stp2_30; \
3114b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  stp1_31 = stp2_31; \
3115b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}
3116b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
3117b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
3118b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define IDCT32 \
31195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage1 */ \
31205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
3121b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
3122b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
3123b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
3124b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
3125b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3126b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
3127b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
3128b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
3129b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
3130b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3131b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
3132b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
3133b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
3134b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
3135b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  \
3136b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
3137b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
3138b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
3139b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
31405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
31425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
31435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_17, stp1_30) \
31445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
31455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
31465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_19, stp1_28) \
31475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
31485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
31495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_21, stp1_26) \
31505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
31515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
31525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_23, stp1_24) \
31535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
31545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
31555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage2 */ \
31565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
3157b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
3158b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
3159b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
3160b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
31615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
3162b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
3163b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
3164b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
3165b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
31665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
31685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
31695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_14) \
31705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
31715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
31725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_11, stp2_12) \
31735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
31755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
31765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
31775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
31785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
31805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
31815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
31825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
31835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
31855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
31865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
31875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
31885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
31905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
31915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
31925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
31935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
31945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
31955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage3 */ \
31965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
3197b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
3198b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
3199b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
3200b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
32015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
32035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
32045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
32055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
32065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
32085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
32095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
32105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
32115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
32135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
32145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_6) \
32155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
32175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
32185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
32195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
32205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
32215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
32225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
32235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
32245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
32265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
32275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_18, stp1_29) \
32285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
32295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
32305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_22, stp1_25) \
32315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_16 = stp2_16; \
32335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_31 = stp2_31; \
32345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_19 = stp2_19; \
32355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_20 = stp2_20; \
32365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_23 = stp2_23; \
32375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_24 = stp2_24; \
32385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_27 = stp2_27; \
32395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_28 = stp2_28; \
32405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
32415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
32425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage4 */ \
32435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
3244b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
3245b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
3246b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
3247b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
32485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
32505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
32515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
32525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
32535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
32555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
32565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_2, stp2_3) \
32575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
32595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
32605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
32615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
32625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
32645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
32655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_10, stp2_13) \
32665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_8 = stp1_8; \
32685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_15 = stp1_15; \
32695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_11 = stp1_11; \
32705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_12 = stp1_12; \
32715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
32735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
32745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
32755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
32765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
32775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
32785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
32795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
32805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
32825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
32835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
32845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
32855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
32865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
32875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
32885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
32895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
32905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
32915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage5 */ \
32925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
32935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
32945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
32955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
32965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
32975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
32985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
32995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
33005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
33015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
33025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
33045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
33055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
33075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
33085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
33095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
33105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
33125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
33135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
33145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
33155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp0 = _mm_add_epi32(tmp0, rounding); \
33175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp1 = _mm_add_epi32(tmp1, rounding); \
33185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp2 = _mm_add_epi32(tmp2, rounding); \
33195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp3 = _mm_add_epi32(tmp3, rounding); \
33205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
33225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
33235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
33245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
33255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
33275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
33285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_4 = stp2_4; \
33305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_7 = stp2_7; \
33315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
33335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
33345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
33355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
33365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
33375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
33385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
33395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
33405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_16 = stp2_16; \
33425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_17 = stp2_17; \
33435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
33455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
33465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_19, stp1_28) \
33475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
33485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
33495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_21, stp1_26) \
33505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_22 = stp2_22; \
33525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_23 = stp2_23; \
33535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_24 = stp2_24; \
33545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_25 = stp2_25; \
33555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_30 = stp2_30; \
33565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_31 = stp2_31; \
33575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
33585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
33595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage6 */ \
33605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
33615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
33625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
33635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
33645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
33655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
33675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
33685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
33695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
33705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
33715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
33725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
33735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
33745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_8 = stp1_8; \
33765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_9 = stp1_9; \
33775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_14 = stp1_14; \
33785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_15 = stp1_15; \
33795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
33815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
33825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_13, stp2_11, stp2_12) \
33835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
33855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
33865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
33875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
33885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
33895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
33905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
33915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
33925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
33935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
33945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
33955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
33965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
33975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
33985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
33995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
34005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
34015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
34025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
34035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage7 */ \
34045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
34055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
34065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
34075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
34085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
34095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
34105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
34115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
34125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
34135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
34145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
34155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
34165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
34175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
34185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
34195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
34205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
34215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
34225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
34235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
34245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
34255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
34265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
34275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
34285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
34295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
34305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
34315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
34325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_16 = stp2_16; \
34335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_17 = stp2_17; \
34345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_18 = stp2_18; \
34355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_19 = stp2_19; \
34365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
34375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
34385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
34395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_21, stp1_26) \
34405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
34415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
34425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_23, stp1_24) \
34435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
34445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_28 = stp2_28; \
34455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_29 = stp2_29; \
34465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_30 = stp2_30; \
34475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_31 = stp2_31; \
34485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
34495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang// Only the upper-left 8x8 block has non-zero coefficients.
34515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
34525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                 int stride) {
34535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
34545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
34555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // idct constants for each stage
34575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
34585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
34595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
34605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
34615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
34625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
34635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
34645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
34655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
34665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
34675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
34685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
34695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
34705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
34715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
34725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
34735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
34755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
34765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
34775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
34785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
34795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
34805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
34815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
34825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
34845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
34855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
34865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
34875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
34885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
34895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
34905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
34915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
34925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
34935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
34955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
34965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
34975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
34985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
34995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
35005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
35015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
35025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
35035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3504b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in[32], col[32];
35055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
35065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
35075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
35085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
35095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_30, stp1_31;
35105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
35115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
35125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
35135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
35145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_30, stp2_31;
35155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3516b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  int i;
3517b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Load input data.
3518b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[0], input);
3519b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[8], input);
3520b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[16], input);
3521b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[24], input);
3522b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[1], input);
3523b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[9], input);
3524b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[17], input);
3525b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[25], input);
3526b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[2], input);
3527b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[10], input);
3528b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[18], input);
3529b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[26], input);
3530b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[3], input);
3531b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[11], input);
3532b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[19], input);
3533b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[27], input);
3534b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
3535b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[4], input);
3536b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[12], input);
3537b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[20], input);
3538b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[28], input);
3539b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[5], input);
3540b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[13], input);
3541b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[21], input);
3542b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[29], input);
3543b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[6], input);
3544b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[14], input);
3545b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[22], input);
3546b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[30], input);
3547b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[7], input);
3548b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[15], input);
3549b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[23], input);
3550b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  LOAD_DQCOEFF(in[31], input);
35515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3552b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  array_transpose_8x8(in, in);
3553b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  array_transpose_8x8(in+8, in+8);
3554b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  array_transpose_8x8(in+16, in+16);
3555b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  array_transpose_8x8(in+24, in+24);
3556b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
3557b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  IDCT32
3558b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
3559b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 1_D: Store 32 intermediate results for each 8x32 block.
3560b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[0] = _mm_add_epi16(stp1_0, stp1_31);
3561b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[1] = _mm_add_epi16(stp1_1, stp1_30);
3562b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[2] = _mm_add_epi16(stp1_2, stp1_29);
3563b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[3] = _mm_add_epi16(stp1_3, stp1_28);
3564b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[4] = _mm_add_epi16(stp1_4, stp1_27);
3565b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[5] = _mm_add_epi16(stp1_5, stp1_26);
3566b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[6] = _mm_add_epi16(stp1_6, stp1_25);
3567b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[7] = _mm_add_epi16(stp1_7, stp1_24);
3568b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[8] = _mm_add_epi16(stp1_8, stp1_23);
3569b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[9] = _mm_add_epi16(stp1_9, stp1_22);
3570b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[10] = _mm_add_epi16(stp1_10, stp1_21);
3571b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[11] = _mm_add_epi16(stp1_11, stp1_20);
3572b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[12] = _mm_add_epi16(stp1_12, stp1_19);
3573b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[13] = _mm_add_epi16(stp1_13, stp1_18);
3574b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[14] = _mm_add_epi16(stp1_14, stp1_17);
3575b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[15] = _mm_add_epi16(stp1_15, stp1_16);
3576b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3577b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3578b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3579b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3580b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3581b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3582b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3583b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3584b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3585b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3586b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3587b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3588b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3589b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3590b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3591b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
3592b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  for (i = 0; i < 4; i++) {
35935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      const __m128i zero = _mm_setzero_si128();
3594b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Transpose 32x8 block to 8x32 block
3595b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(col+i*8, in);
3596b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      IDCT32_34
35975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
35985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // 2_D: Calculate the results and store them to destination.
3599b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[0] = _mm_add_epi16(stp1_0, stp1_31);
3600b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[1] = _mm_add_epi16(stp1_1, stp1_30);
3601b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[2] = _mm_add_epi16(stp1_2, stp1_29);
3602b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[3] = _mm_add_epi16(stp1_3, stp1_28);
3603b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[4] = _mm_add_epi16(stp1_4, stp1_27);
3604b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[5] = _mm_add_epi16(stp1_5, stp1_26);
3605b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[6] = _mm_add_epi16(stp1_6, stp1_25);
3606b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[7] = _mm_add_epi16(stp1_7, stp1_24);
3607b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[8] = _mm_add_epi16(stp1_8, stp1_23);
3608b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[9] = _mm_add_epi16(stp1_9, stp1_22);
3609b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[10] = _mm_add_epi16(stp1_10, stp1_21);
3610b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[11] = _mm_add_epi16(stp1_11, stp1_20);
3611b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[12] = _mm_add_epi16(stp1_12, stp1_19);
3612b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[13] = _mm_add_epi16(stp1_13, stp1_18);
3613b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[14] = _mm_add_epi16(stp1_14, stp1_17);
3614b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[15] = _mm_add_epi16(stp1_15, stp1_16);
3615b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3616b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3617b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3618b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3619b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3620b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3621b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3622b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3623b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3624b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3625b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3626b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3627b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3628b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3629b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3630b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[31] = _mm_sub_epi16(stp1_0, stp1_31);
36315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
36325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Final rounding and shift
3633b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[0] = _mm_adds_epi16(in[0], final_rounding);
3634b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[1] = _mm_adds_epi16(in[1], final_rounding);
3635b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[2] = _mm_adds_epi16(in[2], final_rounding);
3636b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[3] = _mm_adds_epi16(in[3], final_rounding);
3637b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[4] = _mm_adds_epi16(in[4], final_rounding);
3638b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[5] = _mm_adds_epi16(in[5], final_rounding);
3639b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[6] = _mm_adds_epi16(in[6], final_rounding);
3640b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[7] = _mm_adds_epi16(in[7], final_rounding);
3641b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[8] = _mm_adds_epi16(in[8], final_rounding);
3642b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[9] = _mm_adds_epi16(in[9], final_rounding);
3643b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[10] = _mm_adds_epi16(in[10], final_rounding);
3644b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[11] = _mm_adds_epi16(in[11], final_rounding);
3645b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[12] = _mm_adds_epi16(in[12], final_rounding);
3646b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[13] = _mm_adds_epi16(in[13], final_rounding);
3647b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[14] = _mm_adds_epi16(in[14], final_rounding);
3648b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[15] = _mm_adds_epi16(in[15], final_rounding);
3649b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[16] = _mm_adds_epi16(in[16], final_rounding);
3650b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[17] = _mm_adds_epi16(in[17], final_rounding);
3651b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[18] = _mm_adds_epi16(in[18], final_rounding);
3652b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[19] = _mm_adds_epi16(in[19], final_rounding);
3653b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[20] = _mm_adds_epi16(in[20], final_rounding);
3654b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[21] = _mm_adds_epi16(in[21], final_rounding);
3655b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[22] = _mm_adds_epi16(in[22], final_rounding);
3656b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[23] = _mm_adds_epi16(in[23], final_rounding);
3657b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[24] = _mm_adds_epi16(in[24], final_rounding);
3658b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[25] = _mm_adds_epi16(in[25], final_rounding);
3659b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[26] = _mm_adds_epi16(in[26], final_rounding);
3660b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[27] = _mm_adds_epi16(in[27], final_rounding);
3661b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[28] = _mm_adds_epi16(in[28], final_rounding);
3662b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[29] = _mm_adds_epi16(in[29], final_rounding);
3663b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[30] = _mm_adds_epi16(in[30], final_rounding);
3664b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[31] = _mm_adds_epi16(in[31], final_rounding);
3665b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
3666b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[0] = _mm_srai_epi16(in[0], 6);
3667b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[1] = _mm_srai_epi16(in[1], 6);
3668b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[2] = _mm_srai_epi16(in[2], 6);
3669b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[3] = _mm_srai_epi16(in[3], 6);
3670b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[4] = _mm_srai_epi16(in[4], 6);
3671b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[5] = _mm_srai_epi16(in[5], 6);
3672b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[6] = _mm_srai_epi16(in[6], 6);
3673b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[7] = _mm_srai_epi16(in[7], 6);
3674b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[8] = _mm_srai_epi16(in[8], 6);
3675b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[9] = _mm_srai_epi16(in[9], 6);
3676b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[10] = _mm_srai_epi16(in[10], 6);
3677b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[11] = _mm_srai_epi16(in[11], 6);
3678b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[12] = _mm_srai_epi16(in[12], 6);
3679b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[13] = _mm_srai_epi16(in[13], 6);
3680b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[14] = _mm_srai_epi16(in[14], 6);
3681b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[15] = _mm_srai_epi16(in[15], 6);
3682b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[16] = _mm_srai_epi16(in[16], 6);
3683b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[17] = _mm_srai_epi16(in[17], 6);
3684b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[18] = _mm_srai_epi16(in[18], 6);
3685b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[19] = _mm_srai_epi16(in[19], 6);
3686b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[20] = _mm_srai_epi16(in[20], 6);
3687b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[21] = _mm_srai_epi16(in[21], 6);
3688b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[22] = _mm_srai_epi16(in[22], 6);
3689b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[23] = _mm_srai_epi16(in[23], 6);
3690b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[24] = _mm_srai_epi16(in[24], 6);
3691b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[25] = _mm_srai_epi16(in[25], 6);
3692b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[26] = _mm_srai_epi16(in[26], 6);
3693b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[27] = _mm_srai_epi16(in[27], 6);
3694b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[28] = _mm_srai_epi16(in[28], 6);
3695b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[29] = _mm_srai_epi16(in[29], 6);
3696b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[30] = _mm_srai_epi16(in[30], 6);
3697b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[31] = _mm_srai_epi16(in[31], 6);
3698b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
3699b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[0]);
3700b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[1]);
3701b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[2]);
3702b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[3]);
3703b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[4]);
3704b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[5]);
3705b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[6]);
3706b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[7]);
3707b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[8]);
3708b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[9]);
3709b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[10]);
3710b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[11]);
3711b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[12]);
3712b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[13]);
3713b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[14]);
3714b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[15]);
3715b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[16]);
3716b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[17]);
3717b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[18]);
3718b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[19]);
3719b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[20]);
3720b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[21]);
3721b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[22]);
3722b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[23]);
3723b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[24]);
3724b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[25]);
3725b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[26]);
3726b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[27]);
3727b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[28]);
3728b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[29]);
3729b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[30]);
3730b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[31]);
37315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
37325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dest += 8 - (stride * 32);
37335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
37345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
37355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
37365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
37375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                 int stride) {
3738ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3739ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
3740ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3741ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // idct constants for each stage
3742ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3743ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3744ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3745ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3746ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3747ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3748ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3749ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3750ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3751ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3752ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3753ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3756ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3757ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3758ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3759ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3760ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3761ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3762ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3763ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3764ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3765ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3766ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3767ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3768ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3769ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3770ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3771ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3772ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3773ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3774ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3775ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3776ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3777ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3778ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3779ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3780ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3781ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3782ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3783ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3784ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3785ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3786ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3787ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3788ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3789b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in[32], col[128], zero_idx[16];
3790ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3791ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3792ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3793ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3794ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_30, stp1_31;
3795ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3796ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3797ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3798ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3799ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_30, stp2_31;
3800ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3801f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int i, j, i32;
3802f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int zero_flag[2];
3803ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3804b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  for (i = 0; i < 4; i++) {
3805f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    i32 = (i << 5);
3806ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // First 1-D idct
3807ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Load input data.
3808b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[0], input);
3809b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[8], input);
3810b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[16], input);
3811b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[24], input);
3812b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[1], input);
3813b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[9], input);
3814b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[17], input);
3815b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[25], input);
3816b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[2], input);
3817b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[10], input);
3818b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[18], input);
3819b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[26], input);
3820b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[3], input);
3821b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[11], input);
3822b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[19], input);
3823b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[27], input);
3824b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
3825b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[4], input);
3826b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[12], input);
3827b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[20], input);
3828b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[28], input);
3829b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[5], input);
3830b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[13], input);
3831b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[21], input);
3832b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[29], input);
3833b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[6], input);
3834b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[14], input);
3835b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[22], input);
3836b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[30], input);
3837b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[7], input);
3838b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[15], input);
3839b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[23], input);
3840b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      LOAD_DQCOEFF(in[31], input);
3841f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3842f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      // checking if all entries are zero
3843b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[0] = _mm_or_si128(in[0], in[1]);
3844b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[1] = _mm_or_si128(in[2], in[3]);
3845b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[2] = _mm_or_si128(in[4], in[5]);
3846b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[3] = _mm_or_si128(in[6], in[7]);
3847b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[4] = _mm_or_si128(in[8], in[9]);
3848b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[5] = _mm_or_si128(in[10], in[11]);
3849b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[6] = _mm_or_si128(in[12], in[13]);
3850b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[7] = _mm_or_si128(in[14], in[15]);
3851b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[8] = _mm_or_si128(in[16], in[17]);
3852b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[9] = _mm_or_si128(in[18], in[19]);
3853b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[10] = _mm_or_si128(in[20], in[21]);
3854b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[11] = _mm_or_si128(in[22], in[23]);
3855b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[12] = _mm_or_si128(in[24], in[25]);
3856b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[13] = _mm_or_si128(in[26], in[27]);
3857b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[14] = _mm_or_si128(in[28], in[29]);
3858b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      zero_idx[15] = _mm_or_si128(in[30], in[31]);
3859f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3860f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3861f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3862f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3863f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3864f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3865f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3866f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3867f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3868f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3869f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3870f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3871f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3872f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3873f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3874f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3875f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3876f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3877f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
3878f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
3879f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
3880f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
3881f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
3882f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3883f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      if (!zero_flag[0] && !zero_flag[1]) {
3884f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 0] = _mm_setzero_si128();
3885f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 1] = _mm_setzero_si128();
3886f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 2] = _mm_setzero_si128();
3887f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 3] = _mm_setzero_si128();
3888f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 4] = _mm_setzero_si128();
3889f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 5] = _mm_setzero_si128();
3890f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 6] = _mm_setzero_si128();
3891f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 7] = _mm_setzero_si128();
3892f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 8] = _mm_setzero_si128();
3893f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 9] = _mm_setzero_si128();
3894f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 10] = _mm_setzero_si128();
3895f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 11] = _mm_setzero_si128();
3896f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 12] = _mm_setzero_si128();
3897f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 13] = _mm_setzero_si128();
3898f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 14] = _mm_setzero_si128();
3899f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 15] = _mm_setzero_si128();
3900f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 16] = _mm_setzero_si128();
3901f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 17] = _mm_setzero_si128();
3902f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 18] = _mm_setzero_si128();
3903f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 19] = _mm_setzero_si128();
3904f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 20] = _mm_setzero_si128();
3905f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 21] = _mm_setzero_si128();
3906f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 22] = _mm_setzero_si128();
3907f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 23] = _mm_setzero_si128();
3908f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 24] = _mm_setzero_si128();
3909f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 25] = _mm_setzero_si128();
3910f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 26] = _mm_setzero_si128();
3911f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 27] = _mm_setzero_si128();
3912f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 28] = _mm_setzero_si128();
3913f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 29] = _mm_setzero_si128();
3914f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 30] = _mm_setzero_si128();
3915f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 31] = _mm_setzero_si128();
3916f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        continue;
3917f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      }
3918ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3919ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Transpose 32x8 block to 8x32 block
3920b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(in, in);
3921b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(in+8, in+8);
3922b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(in+16, in+16);
3923b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(in+24, in+24);
3924ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3925b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      IDCT32
3926ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3927ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 1_D: Store 32 intermediate results for each 8x32 block.
3928f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3929f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3930f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3931f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3932f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3933f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3934f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3935f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3936f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3937f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3938f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3939f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3940f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3941f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3942f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3943f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3944f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3945f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3946f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3947f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3948f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3949f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3950f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3951f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3952f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3953f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3954f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3955f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3956f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3957f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3958f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3959f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3960b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    }
3961b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  for (i = 0; i < 4; i++) {
3962ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i zero = _mm_setzero_si128();
3963b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Second 1-D idct
3964b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      j = i << 3;
3965b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
3966b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Transpose 32x8 block to 8x32 block
3967b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(col+j, in);
3968b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(col+j+32, in+8);
3969b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(col+j+64, in+16);
3970b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      array_transpose_8x8(col+j+96, in+24);
3971b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
3972b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      IDCT32
3973ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3974ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 2_D: Calculate the results and store them to destination.
3975b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[0] = _mm_add_epi16(stp1_0, stp1_31);
3976b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[1] = _mm_add_epi16(stp1_1, stp1_30);
3977b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[2] = _mm_add_epi16(stp1_2, stp1_29);
3978b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[3] = _mm_add_epi16(stp1_3, stp1_28);
3979b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[4] = _mm_add_epi16(stp1_4, stp1_27);
3980b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[5] = _mm_add_epi16(stp1_5, stp1_26);
3981b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[6] = _mm_add_epi16(stp1_6, stp1_25);
3982b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[7] = _mm_add_epi16(stp1_7, stp1_24);
3983b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[8] = _mm_add_epi16(stp1_8, stp1_23);
3984b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[9] = _mm_add_epi16(stp1_9, stp1_22);
3985b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[10] = _mm_add_epi16(stp1_10, stp1_21);
3986b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[11] = _mm_add_epi16(stp1_11, stp1_20);
3987b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[12] = _mm_add_epi16(stp1_12, stp1_19);
3988b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[13] = _mm_add_epi16(stp1_13, stp1_18);
3989b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[14] = _mm_add_epi16(stp1_14, stp1_17);
3990b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[15] = _mm_add_epi16(stp1_15, stp1_16);
3991b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3992b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3993b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3994b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3995b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3996b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3997b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3998b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3999b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[24] = _mm_sub_epi16(stp1_7, stp1_24);
4000b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[25] = _mm_sub_epi16(stp1_6, stp1_25);
4001b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[26] = _mm_sub_epi16(stp1_5, stp1_26);
4002b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[27] = _mm_sub_epi16(stp1_4, stp1_27);
4003b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[28] = _mm_sub_epi16(stp1_3, stp1_28);
4004b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[29] = _mm_sub_epi16(stp1_2, stp1_29);
4005b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[30] = _mm_sub_epi16(stp1_1, stp1_30);
4006b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[31] = _mm_sub_epi16(stp1_0, stp1_31);
4007ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
4008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Final rounding and shift
4009b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[0] = _mm_adds_epi16(in[0], final_rounding);
4010b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[1] = _mm_adds_epi16(in[1], final_rounding);
4011b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[2] = _mm_adds_epi16(in[2], final_rounding);
4012b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[3] = _mm_adds_epi16(in[3], final_rounding);
4013b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[4] = _mm_adds_epi16(in[4], final_rounding);
4014b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[5] = _mm_adds_epi16(in[5], final_rounding);
4015b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[6] = _mm_adds_epi16(in[6], final_rounding);
4016b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[7] = _mm_adds_epi16(in[7], final_rounding);
4017b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[8] = _mm_adds_epi16(in[8], final_rounding);
4018b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[9] = _mm_adds_epi16(in[9], final_rounding);
4019b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[10] = _mm_adds_epi16(in[10], final_rounding);
4020b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[11] = _mm_adds_epi16(in[11], final_rounding);
4021b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[12] = _mm_adds_epi16(in[12], final_rounding);
4022b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[13] = _mm_adds_epi16(in[13], final_rounding);
4023b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[14] = _mm_adds_epi16(in[14], final_rounding);
4024b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[15] = _mm_adds_epi16(in[15], final_rounding);
4025b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[16] = _mm_adds_epi16(in[16], final_rounding);
4026b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[17] = _mm_adds_epi16(in[17], final_rounding);
4027b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[18] = _mm_adds_epi16(in[18], final_rounding);
4028b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[19] = _mm_adds_epi16(in[19], final_rounding);
4029b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[20] = _mm_adds_epi16(in[20], final_rounding);
4030b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[21] = _mm_adds_epi16(in[21], final_rounding);
4031b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[22] = _mm_adds_epi16(in[22], final_rounding);
4032b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[23] = _mm_adds_epi16(in[23], final_rounding);
4033b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[24] = _mm_adds_epi16(in[24], final_rounding);
4034b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[25] = _mm_adds_epi16(in[25], final_rounding);
4035b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[26] = _mm_adds_epi16(in[26], final_rounding);
4036b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[27] = _mm_adds_epi16(in[27], final_rounding);
4037b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[28] = _mm_adds_epi16(in[28], final_rounding);
4038b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[29] = _mm_adds_epi16(in[29], final_rounding);
4039b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[30] = _mm_adds_epi16(in[30], final_rounding);
4040b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[31] = _mm_adds_epi16(in[31], final_rounding);
4041b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
4042b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[0] = _mm_srai_epi16(in[0], 6);
4043b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[1] = _mm_srai_epi16(in[1], 6);
4044b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[2] = _mm_srai_epi16(in[2], 6);
4045b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[3] = _mm_srai_epi16(in[3], 6);
4046b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[4] = _mm_srai_epi16(in[4], 6);
4047b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[5] = _mm_srai_epi16(in[5], 6);
4048b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[6] = _mm_srai_epi16(in[6], 6);
4049b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[7] = _mm_srai_epi16(in[7], 6);
4050b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[8] = _mm_srai_epi16(in[8], 6);
4051b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[9] = _mm_srai_epi16(in[9], 6);
4052b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[10] = _mm_srai_epi16(in[10], 6);
4053b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[11] = _mm_srai_epi16(in[11], 6);
4054b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[12] = _mm_srai_epi16(in[12], 6);
4055b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[13] = _mm_srai_epi16(in[13], 6);
4056b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[14] = _mm_srai_epi16(in[14], 6);
4057b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[15] = _mm_srai_epi16(in[15], 6);
4058b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[16] = _mm_srai_epi16(in[16], 6);
4059b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[17] = _mm_srai_epi16(in[17], 6);
4060b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[18] = _mm_srai_epi16(in[18], 6);
4061b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[19] = _mm_srai_epi16(in[19], 6);
4062b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[20] = _mm_srai_epi16(in[20], 6);
4063b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[21] = _mm_srai_epi16(in[21], 6);
4064b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[22] = _mm_srai_epi16(in[22], 6);
4065b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[23] = _mm_srai_epi16(in[23], 6);
4066b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[24] = _mm_srai_epi16(in[24], 6);
4067b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[25] = _mm_srai_epi16(in[25], 6);
4068b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[26] = _mm_srai_epi16(in[26], 6);
4069b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[27] = _mm_srai_epi16(in[27], 6);
4070b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[28] = _mm_srai_epi16(in[28], 6);
4071b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[29] = _mm_srai_epi16(in[29], 6);
4072b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[30] = _mm_srai_epi16(in[30], 6);
4073b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in[31] = _mm_srai_epi16(in[31], 6);
4074b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
4075b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[0]);
4076b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[1]);
4077b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[2]);
4078b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[3]);
4079b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[4]);
4080b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[5]);
4081b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[6]);
4082b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[7]);
4083b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[8]);
4084b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[9]);
4085b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[10]);
4086b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[11]);
4087b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[12]);
4088b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[13]);
4089b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[14]);
4090b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[15]);
4091b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[16]);
4092b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[17]);
4093b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[18]);
4094b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[19]);
4095b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[20]);
4096b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[21]);
4097b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[22]);
4098b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[23]);
4099b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[24]);
4100b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[25]);
4101b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[26]);
4102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[27]);
4103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[28]);
4104b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[29]);
4105b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[30]);
4106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      RECON_AND_STORE(dest, in[31]);
4107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
4108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      dest += 8 - (stride * 32);
4109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
41105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}  //NOLINT
41115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
41125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
41135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i dc_value;
41145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i zero = _mm_setzero_si128();
41155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int a, i;
41165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
41175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
41185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  a = dct_const_round_shift(a * cospi_16_64);
41195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  a = ROUND_POWER_OF_TWO(a, 6);
41205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
41215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  dc_value = _mm_set1_epi16(a);
41225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
41235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (i = 0; i < 4; ++i) {
41245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
41565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dest += 8 - (stride * 32);
41575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
4158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
4159