vp9_idct_intrin_sse2.c revision 9b35249446b07f40ac5fcc3205f2c048616efacc
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <assert.h>
12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <emmintrin.h>  // SSE2
13ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "./vpx_config.h"
14ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vpx/vpx_integer.h"
15ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_common.h"
16ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_idct.h"
17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Reconstructs one row of four pixels: loads 4 bytes from dest, widens them
// to 16 bits, adds the low four 16-bit lanes of in_x, packs the sums back to
// bytes with unsigned saturation, stores them, and advances dest by stride.
//
//   dest: modifiable uint8_t pointer; read, written and then incremented.
//   in_x: __m128i whose low four 16-bit lanes are the values to add.
//
// The expansion also uses two names that must exist in the calling scope:
// `zero` (an all-zero __m128i) and `stride`.
// The body is a plain brace block, so it works with or without a trailing
// semicolon at the call site.
#define RECON_AND_STORE4X4(dest, in_x)                     \
  {                                                        \
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));  \
    d0 = _mm_unpacklo_epi8(d0, zero);                      \
    d0 = _mm_add_epi16((in_x), d0);                        \
    d0 = _mm_packus_epi16(d0, d0);                         \
    *(int *)(dest) = _mm_cvtsi128_si32(d0);                \
    (dest) += stride;                                      \
  }
279b35249446b07f40ac5fcc3205f2c048616efacchkuang
285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
29ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
30ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i eight = _mm_set1_epi16(8);
31ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
32ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
33ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
34ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i input0, input1, input2, input3;
37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Rows
399b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_load_si128((const __m128i *)input);
409b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_load_si128((const __m128i *)(input + 8));
41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
42ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Construct i3, i1, i3, i1, i2, i0, i2, i0
43ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_shufflelo_epi16(input0, 0xd8);
449b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_shufflehi_epi16(input0, 0xd8);
45ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_shufflelo_epi16(input2, 0xd8);
469b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_shufflehi_epi16(input2, 0xd8);
47ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
489b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input0, input0);
49ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_unpacklo_epi32(input0, input0);
509b35249446b07f40ac5fcc3205f2c048616efacchkuang  input3 = _mm_unpackhi_epi32(input2, input2);
51ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_unpacklo_epi32(input2, input2);
52ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
53ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
54ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_madd_epi16(input0, cst);
55ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_madd_epi16(input1, cst);
56ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_madd_epi16(input2, cst);
57ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_madd_epi16(input3, cst);
58ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
59ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_add_epi32(input0, rounding);
60ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_add_epi32(input1, rounding);
61ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi32(input2, rounding);
62ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi32(input3, rounding);
63ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
64ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
65ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
66ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
67ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
68ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
709b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_packs_epi32(input0, input1);
719b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_packs_epi32(input2, input3);
72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
73ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Transpose
749b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_unpacklo_epi16(input0, input1);
759b35249446b07f40ac5fcc3205f2c048616efacchkuang  input3 = _mm_unpackhi_epi16(input0, input1);
769b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_unpacklo_epi32(input2, input3);
779b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input2, input3);
78ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
79ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Switch column2, column 3, and then, we got:
80ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // input2: column1, column 0;  input3: column2, column 3.
81ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shuffle_epi32(input1, 0x4e);
82ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input0, input1);
83ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_sub_epi16(input0, input1);
84ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
85ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Columns
86ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Construct i3, i1, i3, i1, i2, i0, i2, i0
879b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_unpacklo_epi32(input2, input2);
889b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input2, input2);
899b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_unpackhi_epi32(input3, input3);
90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_unpacklo_epi32(input3, input3);
91ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
92ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
93ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_madd_epi16(input0, cst);
94ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_madd_epi16(input1, cst);
95ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_madd_epi16(input2, cst);
96ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_madd_epi16(input3, cst);
97ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
98ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_add_epi32(input0, rounding);
99ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_add_epi32(input1, rounding);
100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi32(input2, rounding);
101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi32(input3, rounding);
102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
1099b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_packs_epi32(input0, input2);
1109b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_packs_epi32(input1, input3);
111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Transpose
1139b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_unpacklo_epi16(input0, input1);
1149b35249446b07f40ac5fcc3205f2c048616efacchkuang  input3 = _mm_unpackhi_epi16(input0, input1);
1159b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_unpacklo_epi32(input2, input3);
1169b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input2, input3);
117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Switch column2, column 3, and then, we got:
119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // input2: column1, column 0;  input3: column2, column 3.
120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shuffle_epi32(input1, 0x4e);
121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input0, input1);
122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_sub_epi16(input0, input1);
123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final round and shift
125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input2, eight);
126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi16(input3, eight);
127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi16(input2, 4);
129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi16(input3, 4);
130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1319b35249446b07f40ac5fcc3205f2c048616efacchkuang  // Reconstruction and Store
1329b35249446b07f40ac5fcc3205f2c048616efacchkuang  {
1339b35249446b07f40ac5fcc3205f2c048616efacchkuang     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
1349b35249446b07f40ac5fcc3205f2c048616efacchkuang     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
1359b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_unpacklo_epi32(d0,
1369b35249446b07f40ac5fcc3205f2c048616efacchkuang          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
1379b35249446b07f40ac5fcc3205f2c048616efacchkuang     d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
1389b35249446b07f40ac5fcc3205f2c048616efacchkuang                    *(const int *) (dest + stride * 3)), d2);
1399b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_unpacklo_epi8(d0, zero);
1409b35249446b07f40ac5fcc3205f2c048616efacchkuang     d2 = _mm_unpacklo_epi8(d2, zero);
1419b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_add_epi16(d0, input2);
1429b35249446b07f40ac5fcc3205f2c048616efacchkuang     d2 = _mm_add_epi16(d2, input3);
1439b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_packus_epi16(d0, d2);
1449b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input0
1459b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)dest = _mm_cvtsi128_si32(d0);
1469b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input1
1479b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_srli_si128(d0, 4);
1489b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
1499b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input2
1509b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_srli_si128(d0, 4);
1519b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
1529b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input3
1539b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_srli_si128(d0, 4);
1549b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
15991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i dc_value;
16091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i zero = _mm_setzero_si128();
16191037db265ecdd914a26e056cf69207b4f50924ehkuang  int a;
16291037db265ecdd914a26e056cf69207b4f50924ehkuang
16391037db265ecdd914a26e056cf69207b4f50924ehkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
16491037db265ecdd914a26e056cf69207b4f50924ehkuang  a = dct_const_round_shift(a * cospi_16_64);
16591037db265ecdd914a26e056cf69207b4f50924ehkuang  a = ROUND_POWER_OF_TWO(a, 4);
16691037db265ecdd914a26e056cf69207b4f50924ehkuang
16791037db265ecdd914a26e056cf69207b4f50924ehkuang  dc_value = _mm_set1_epi16(a);
16891037db265ecdd914a26e056cf69207b4f50924ehkuang
16991037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17091037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17191037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17391037db265ecdd914a26e056cf69207b4f50924ehkuang}
17491037db265ecdd914a26e056cf69207b4f50924ehkuang
// Transposes, in place, the 4x4 block of 16-bit values stored in the low
// four lanes of res[0..3] (one row per register).
//
// On return the low four lanes of res[k] hold column k of the input.
// The upper four lanes are by-products: res[0] and res[2] carry columns 1
// and 3 there, while res[1] and res[3] repeat their own low half.
static INLINE void transpose_4x4(__m128i *res) {
  // Interleave rows pairwise: (r0, r1) and (r2, r3).
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);

  // res[0] = column 0 | column 1,  res[2] = column 2 | column 3.
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // Bring the odd columns down into the low half of their own registers.
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
18491037db265ecdd914a26e056cf69207b4f50924ehkuang
1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void idct4_1d_sse2(__m128i *in) {
18691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
18791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
18891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
18991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
19091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
19191037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u[8], v[8];
19291037db265ecdd914a26e056cf69207b4f50924ehkuang
19391037db265ecdd914a26e056cf69207b4f50924ehkuang  transpose_4x4(in);
19491037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
19591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
19691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
19791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
19891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
19991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
20091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
20191037db265ecdd914a26e056cf69207b4f50924ehkuang
20291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
20391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
20491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
20591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
20691037db265ecdd914a26e056cf69207b4f50924ehkuang
20791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
20891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
20991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
21091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
21191037db265ecdd914a26e056cf69207b4f50924ehkuang
21291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_packs_epi32(v[0], v[2]);
21391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_packs_epi32(v[1], v[3]);
21491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpackhi_epi64(u[0], u[0]);
21591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi64(u[1], u[1]);
21691037db265ecdd914a26e056cf69207b4f50924ehkuang
21791037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
21891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_add_epi16(u[0], u[3]);
21991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_add_epi16(u[1], u[2]);
22091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_sub_epi16(u[1], u[2]);
22191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_sub_epi16(u[0], u[3]);
22291037db265ecdd914a26e056cf69207b4f50924ehkuang}
22391037db265ecdd914a26e056cf69207b4f50924ehkuang
2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void iadst4_1d_sse2(__m128i *in) {
22591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
22691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
22791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
22891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
22991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
23091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i kZero = _mm_set1_epi16(0);
23191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
23291037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u[8], v[8], in7;
23391037db265ecdd914a26e056cf69207b4f50924ehkuang
23491037db265ecdd914a26e056cf69207b4f50924ehkuang  transpose_4x4(in);
23591037db265ecdd914a26e056cf69207b4f50924ehkuang  in7 = _mm_add_epi16(in[0], in[3]);
23691037db265ecdd914a26e056cf69207b4f50924ehkuang  in7 = _mm_sub_epi16(in7, in[2]);
23791037db265ecdd914a26e056cf69207b4f50924ehkuang
23891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
23991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
24091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(in7, kZero);
24191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpacklo_epi16(in[1], kZero);
24291037db265ecdd914a26e056cf69207b4f50924ehkuang
24391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
24491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
24591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
24691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
24791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
24891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
24991037db265ecdd914a26e056cf69207b4f50924ehkuang
25091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[1]);
25191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[3], v[4]);
25291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = v[2];
25391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(u[0], u[1]);
25491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_slli_epi32(v[5], 2);
25591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(u[3], v[5]);
25691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_sub_epi32(u[5], u[4]);
25791037db265ecdd914a26e056cf69207b4f50924ehkuang
25891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
25991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
26091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
26191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
26291037db265ecdd914a26e056cf69207b4f50924ehkuang
26391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
26491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
26591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
26691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
26791037db265ecdd914a26e056cf69207b4f50924ehkuang
26891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_packs_epi32(u[0], u[2]);
26991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_packs_epi32(u[1], u[3]);
27091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_unpackhi_epi64(in[0], in[0]);
27191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_unpackhi_epi64(in[1], in[1]);
27291037db265ecdd914a26e056cf69207b4f50924ehkuang}
27391037db265ecdd914a26e056cf69207b4f50924ehkuang
// 4x4 inverse hybrid transform with reconstruction. Two 1-D passes are
// selected by tx_type (each pass transposes its input before transforming),
// the result is rounded (+8, >> 4) and added to the 4x4 pixel block at dest
// with unsigned 8-bit saturation.
//
//   input:   16 int16_t coefficients, read four at a time.
//   dest:    top-left pixel of the 4x4 block; updated in place.
//   stride:  byte distance between consecutive rows of dest.
//   tx_type: 0..3, see the switch below; any other value trips assert(0).
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[4];
  // `zero` is referenced by name inside RECON_AND_STORE4X4.
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  int i;

  // Each register receives four coefficients in its low 64 bits.
  in[0] = _mm_loadl_epi64((const __m128i *)input);
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  for (i = 0; i < 4; ++i) {
    in[i] = _mm_add_epi16(in[i], eight);
    in[i] = _mm_srai_epi16(in[i], 4);
  }

  // One call per row; the macro advances dest by stride each time.
  RECON_AND_STORE4X4(dest, in[0]);
  RECON_AND_STORE4X4(dest, in[1]);
  RECON_AND_STORE4X4(dest, in[2]);
  RECON_AND_STORE4X4(dest, in[3]);
}
32391037db265ecdd914a26e056cf69207b4f50924ehkuang
// Transposes an 8x8 block of 16-bit values. in0..in7 are the eight rows
// (one __m128i each); out0..out7 receive the eight columns.
// All inputs are consumed into temporaries before any output is assigned,
// so the outputs may be the same variables as the inputs.
// The body is a plain brace block, so it works with or without a trailing
// semicolon at the call site.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,         \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                                   \
    /* 16-bit interleave of adjacent row pairs */                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16((in0), (in1));           \
    const __m128i tr0_1 = _mm_unpacklo_epi16((in2), (in3));           \
    const __m128i tr0_2 = _mm_unpackhi_epi16((in0), (in1));           \
    const __m128i tr0_3 = _mm_unpackhi_epi16((in2), (in3));           \
    const __m128i tr0_4 = _mm_unpacklo_epi16((in4), (in5));           \
    const __m128i tr0_5 = _mm_unpacklo_epi16((in6), (in7));           \
    const __m128i tr0_6 = _mm_unpackhi_epi16((in4), (in5));           \
    const __m128i tr0_7 = _mm_unpackhi_epi16((in6), (in7));           \
                                                                      \
    /* 32-bit interleave: each result holds two half-columns */       \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);           \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);           \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);           \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);           \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);           \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);           \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);           \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);           \
                                                                      \
    /* 64-bit interleave joins the top and bottom half-columns */     \
    (out0) = _mm_unpacklo_epi64(tr1_0, tr1_4);                        \
    (out1) = _mm_unpackhi_epi64(tr1_0, tr1_4);                        \
    (out2) = _mm_unpacklo_epi64(tr1_2, tr1_6);                        \
    (out3) = _mm_unpackhi_epi64(tr1_2, tr1_6);                        \
    (out4) = _mm_unpacklo_epi64(tr1_1, tr1_5);                        \
    (out5) = _mm_unpackhi_epi64(tr1_1, tr1_5);                        \
    (out6) = _mm_unpacklo_epi64(tr1_3, tr1_7);                        \
    (out7) = _mm_unpackhi_epi64(tr1_3, tr1_7);                        \
  }
354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Transposes the left four columns of an 8-row block of 16-bit values.
// Only the low four lanes of in0..in7 are read. out0..out3 receive columns
// 0..3 (eight values each); out4..out7 are set to zero.
// Uses the name `zero` (an all-zero __m128i) from the calling scope.
// All inputs are consumed into temporaries before any output is assigned,
// so the outputs may be the same variables as the inputs.
// The body is a plain brace block, so it works with or without a trailing
// semicolon at the call site.
#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7,         \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                                   \
    const __m128i tr0_0 = _mm_unpacklo_epi16((in0), (in1));           \
    const __m128i tr0_1 = _mm_unpacklo_epi16((in2), (in3));           \
    const __m128i tr0_4 = _mm_unpacklo_epi16((in4), (in5));           \
    const __m128i tr0_5 = _mm_unpacklo_epi16((in6), (in7));           \
                                                                      \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);           \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);           \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);           \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);           \
                                                                      \
    (out0) = _mm_unpacklo_epi64(tr1_0, tr1_4);                        \
    (out1) = _mm_unpackhi_epi64(tr1_0, tr1_4);                        \
    (out2) = _mm_unpacklo_epi64(tr1_2, tr1_6);                        \
    (out3) = _mm_unpackhi_epi64(tr1_2, tr1_6);                        \
    (out4) = (out5) = (out6) = (out7) = zero;                         \
  }
374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
/* Transposes four 8-lane rows of 16-bit values (in0..in3) into four vectors
 * that each hold two 4-element columns back to back:
 *   out0 = column 0 | column 1,  out1 = column 2 | column 3,
 *   out2 = column 4 | column 5,  out3 = column 6 | column 7.
 * Results are written to the out parameters.  All interleaves of the inputs
 * are taken before the first write, so passing the same variables for in and
 * out (in-place use) is safe. */
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
                                                        \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
    out2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */  \
    out3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */  \
  }
387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Multiplies interleaved 16-bit pairs by constant pairs, sums each pair into
// a 32-bit lane (_mm_madd_epi16), adds `rounding`, shifts right by
// DCT_CONST_BITS and packs the results back to 16 bits with signed
// saturation:
//   res0 <- (lo_0, hi_0) x cst0      res1 <- (lo_0, hi_0) x cst1
//   res2 <- (lo_1, hi_1) x cst2      res3 <- (lo_1, hi_1) x cst3
// The expanding scope must provide tmp0..tmp7, `rounding` and DCT_CONST_BITS.
// All eight tmp values are produced before any res is written.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_0, cst0), rounding), DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_0, cst0), rounding), DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_0, cst1), rounding), DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_0, cst1), rounding), DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_1, cst2), rounding), DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_1, cst2), rounding), DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_1, cst3), rounding), DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_1, cst3), rounding), DCT_CONST_BITS); \
    \
    /* Narrow each lo/hi pair of 32-bit results into one 16-bit vector. */ \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }
424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
/* One four-stage 1-D 8-point inverse DCT pass over in0..in7, updated in
 * place.  Expands to bare statements ending in ';', so the use site needs no
 * trailing semicolon.  The expanding scope must provide in0..in7,
 * stp1_0..stp1_7, stp2_0..stp2_7, tmp0..tmp7, the constant pairs
 * stg1_0..stg1_3 and stg2_0..stg2_3, `rounding`, and DCT_CONST_BITS. */
#define IDCT8_1D \
  /* Stage 1: odd inputs (1,7) and (3,5) times stg1_* -> stp1_4..stp1_7. */ \
  { \
    const __m128i odd17_lo = _mm_unpacklo_epi16(in1, in7); \
    const __m128i odd17_hi = _mm_unpackhi_epi16(in1, in7); \
    const __m128i odd35_lo = _mm_unpacklo_epi16(in3, in5); \
    const __m128i odd35_hi = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(odd17_lo, odd17_hi, odd35_lo, odd35_hi, \
                           stg1_0, stg1_1, stg1_2, stg1_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
  } \
  \
  /* Stage 2: even inputs (0,4) and (2,6) times stg2_* -> stp2_0..stp2_3, */ \
  /* plus saturating add/sub butterflies on the stage-1 outputs.          */ \
  { \
    const __m128i even04_lo = _mm_unpacklo_epi16(in0, in4); \
    const __m128i even04_hi = _mm_unpackhi_epi16(in0, in4); \
    const __m128i even26_lo = _mm_unpacklo_epi16(in2, in6); \
    const __m128i even26_hi = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(even04_lo, even04_hi, even26_lo, even26_hi, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
  } \
  \
  /* Stage 3: butterflies on stp2_0..stp2_3; (stp2_6, stp2_5) is multiplied */ \
  /* by stg2_1 / stg2_0, rounded, shifted and packed into stp1_5 / stp1_6.  */ \
  { \
    const __m128i mid65_lo = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i mid65_hi = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    \
    tmp0 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(mid65_lo, stg2_1), rounding), \
        DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(mid65_hi, stg2_1), rounding), \
        DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(mid65_lo, stg2_0), rounding), \
        DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(mid65_hi, stg2_0), rounding), \
        DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage 4: final mirrored butterflies, written back to in0..in7. */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4);
493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
/* Adds the eight 16-bit residuals in in_x to the eight bytes at dest, packs
 * the sums back to bytes with unsigned saturation, stores them at dest and
 * then advances dest by `stride`.  `dest` must therefore be a modifiable
 * pointer; `zero` (an __m128i) and `stride` must be visible in the expanding
 * scope. */
#define RECON_AND_STORE(dest, in_x) \
  { \
    /* Widen the 8 destination bytes to 16 bits and add the residual. */ \
    __m128i recon_px = _mm_add_epi16( \
        in_x, \
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(dest)), zero)); \
    /* Saturate to [0, 255]; only the low 8 bytes are written back. */ \
    recon_px = _mm_packus_epi16(recon_px, recon_px); \
    _mm_storel_epi64((__m128i *)(dest), recon_px); \
    dest += stride; \
  }
503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
5045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<4);
508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Load input data.
5245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in0 = _mm_load_si128((const __m128i *)input);
5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
5265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
5275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
5285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
5295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
5305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
5315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 2-D
534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 2; i++) {
5355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                  in4, in5, in6, in7);
538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // 4-stage 1D idct8x8
5405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    IDCT8_1D
541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final rounding and shift
544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(in0, final_rounding);
545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(in1, final_rounding);
546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(in2, final_rounding);
547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(in3, final_rounding);
548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_adds_epi16(in4, final_rounding);
549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_adds_epi16(in5, final_rounding);
550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_adds_epi16(in6, final_rounding);
551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_adds_epi16(in7, final_rounding);
552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_srai_epi16(in0, 5);
554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_srai_epi16(in1, 5);
555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_srai_epi16(in2, 5);
556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_srai_epi16(in3, 5);
557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_srai_epi16(in4, 5);
558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_srai_epi16(in5, 5);
559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_srai_epi16(in6, 5);
560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_srai_epi16(in7, 5);
561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in0);
563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in1);
564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in2);
565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in3);
566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in4);
567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in5);
568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in6);
569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in7);
570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
5725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
573f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  __m128i dc_value;
574f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  const __m128i zero = _mm_setzero_si128();
575f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int a;
576f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
577f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
578f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(a * cospi_16_64);
579f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = ROUND_POWER_OF_TWO(a, 5);
580f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
581f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  dc_value = _mm_set1_epi16(a);
582f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
583f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
584f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
585f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
586f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
587f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
588f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
589f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
590f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
591f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang}
592f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
59391037db265ecdd914a26e056cf69207b4f50924ehkuang// perform 8x8 transpose
59491037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
59591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
59691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
59791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
59891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
59991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
60091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
60191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
60291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
60391037db265ecdd914a26e056cf69207b4f50924ehkuang
60491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
60591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
60691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
60791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
60891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
60991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
61091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
61191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
61291037db265ecdd914a26e056cf69207b4f50924ehkuang
61391037db265ecdd914a26e056cf69207b4f50924ehkuang  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
61491037db265ecdd914a26e056cf69207b4f50924ehkuang  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
61591037db265ecdd914a26e056cf69207b4f50924ehkuang  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
61691037db265ecdd914a26e056cf69207b4f50924ehkuang  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
61791037db265ecdd914a26e056cf69207b4f50924ehkuang  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
61891037db265ecdd914a26e056cf69207b4f50924ehkuang  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
61991037db265ecdd914a26e056cf69207b4f50924ehkuang  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
62091037db265ecdd914a26e056cf69207b4f50924ehkuang  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
62191037db265ecdd914a26e056cf69207b4f50924ehkuang}
62291037db265ecdd914a26e056cf69207b4f50924ehkuang
6235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void idct8_1d_sse2(__m128i *in) {
62491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
62591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
62691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
62791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
62891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
62991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
63091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
63191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
63291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
63391037db265ecdd914a26e056cf69207b4f50924ehkuang
63491037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
63591037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
63691037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
63791037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
63891037db265ecdd914a26e056cf69207b4f50924ehkuang
63991037db265ecdd914a26e056cf69207b4f50924ehkuang  in0 = in[0];
64091037db265ecdd914a26e056cf69207b4f50924ehkuang  in1 = in[1];
64191037db265ecdd914a26e056cf69207b4f50924ehkuang  in2 = in[2];
64291037db265ecdd914a26e056cf69207b4f50924ehkuang  in3 = in[3];
64391037db265ecdd914a26e056cf69207b4f50924ehkuang  in4 = in[4];
64491037db265ecdd914a26e056cf69207b4f50924ehkuang  in5 = in[5];
64591037db265ecdd914a26e056cf69207b4f50924ehkuang  in6 = in[6];
64691037db265ecdd914a26e056cf69207b4f50924ehkuang  in7 = in[7];
64791037db265ecdd914a26e056cf69207b4f50924ehkuang
6485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
64991037db265ecdd914a26e056cf69207b4f50924ehkuang  TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
65091037db265ecdd914a26e056cf69207b4f50924ehkuang                in4, in5, in6, in7);
65191037db265ecdd914a26e056cf69207b4f50924ehkuang
65291037db265ecdd914a26e056cf69207b4f50924ehkuang  // 4-stage 1D idct8x8
6535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  IDCT8_1D
65491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = in0;
65591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = in1;
65691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = in2;
65791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = in3;
65891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = in4;
65991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = in5;
66091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = in6;
66191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = in7;
66291037db265ecdd914a26e056cf69207b4f50924ehkuang}
66391037db265ecdd914a26e056cf69207b4f50924ehkuang
6645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void iadst8_1d_sse2(__m128i *in) {
66591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
66691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
66791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
66891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
66991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
67091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
67191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
67291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
67391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
67491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
67591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
67691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
67791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
67891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__const_0 = _mm_set1_epi16(0);
67991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
68091037db265ecdd914a26e056cf69207b4f50924ehkuang
68191037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
68291037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
68391037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
68491037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
68591037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
68691037db265ecdd914a26e056cf69207b4f50924ehkuang
68791037db265ecdd914a26e056cf69207b4f50924ehkuang  // transpose
68891037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(in, in);
68991037db265ecdd914a26e056cf69207b4f50924ehkuang
69091037db265ecdd914a26e056cf69207b4f50924ehkuang  // properly aligned for butterfly input
69191037db265ecdd914a26e056cf69207b4f50924ehkuang  in0  = in[7];
69291037db265ecdd914a26e056cf69207b4f50924ehkuang  in1  = in[0];
69391037db265ecdd914a26e056cf69207b4f50924ehkuang  in2  = in[5];
69491037db265ecdd914a26e056cf69207b4f50924ehkuang  in3  = in[2];
69591037db265ecdd914a26e056cf69207b4f50924ehkuang  in4  = in[3];
69691037db265ecdd914a26e056cf69207b4f50924ehkuang  in5  = in[4];
69791037db265ecdd914a26e056cf69207b4f50924ehkuang  in6  = in[1];
69891037db265ecdd914a26e056cf69207b4f50924ehkuang  in7  = in[6];
69991037db265ecdd914a26e056cf69207b4f50924ehkuang
70091037db265ecdd914a26e056cf69207b4f50924ehkuang  // column transformation
70191037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
70291037db265ecdd914a26e056cf69207b4f50924ehkuang  // interleave and multiply/add into 32-bit integer
70391037db265ecdd914a26e056cf69207b4f50924ehkuang  s0 = _mm_unpacklo_epi16(in0, in1);
70491037db265ecdd914a26e056cf69207b4f50924ehkuang  s1 = _mm_unpackhi_epi16(in0, in1);
70591037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_unpacklo_epi16(in2, in3);
70691037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_unpackhi_epi16(in2, in3);
70791037db265ecdd914a26e056cf69207b4f50924ehkuang  s4 = _mm_unpacklo_epi16(in4, in5);
70891037db265ecdd914a26e056cf69207b4f50924ehkuang  s5 = _mm_unpackhi_epi16(in4, in5);
70991037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_unpacklo_epi16(in6, in7);
71091037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_unpackhi_epi16(in6, in7);
71191037db265ecdd914a26e056cf69207b4f50924ehkuang
71291037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
71391037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
71491037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
71591037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
71691037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
71791037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
71891037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
71991037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
72091037db265ecdd914a26e056cf69207b4f50924ehkuang  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
72191037db265ecdd914a26e056cf69207b4f50924ehkuang  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
72291037db265ecdd914a26e056cf69207b4f50924ehkuang  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
72391037db265ecdd914a26e056cf69207b4f50924ehkuang  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
72491037db265ecdd914a26e056cf69207b4f50924ehkuang  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
72591037db265ecdd914a26e056cf69207b4f50924ehkuang  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
72691037db265ecdd914a26e056cf69207b4f50924ehkuang  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
72791037db265ecdd914a26e056cf69207b4f50924ehkuang  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
72891037db265ecdd914a26e056cf69207b4f50924ehkuang
72991037db265ecdd914a26e056cf69207b4f50924ehkuang  // addition
73091037db265ecdd914a26e056cf69207b4f50924ehkuang  w0 = _mm_add_epi32(u0, u8);
73191037db265ecdd914a26e056cf69207b4f50924ehkuang  w1 = _mm_add_epi32(u1, u9);
73291037db265ecdd914a26e056cf69207b4f50924ehkuang  w2 = _mm_add_epi32(u2, u10);
73391037db265ecdd914a26e056cf69207b4f50924ehkuang  w3 = _mm_add_epi32(u3, u11);
73491037db265ecdd914a26e056cf69207b4f50924ehkuang  w4 = _mm_add_epi32(u4, u12);
73591037db265ecdd914a26e056cf69207b4f50924ehkuang  w5 = _mm_add_epi32(u5, u13);
73691037db265ecdd914a26e056cf69207b4f50924ehkuang  w6 = _mm_add_epi32(u6, u14);
73791037db265ecdd914a26e056cf69207b4f50924ehkuang  w7 = _mm_add_epi32(u7, u15);
73891037db265ecdd914a26e056cf69207b4f50924ehkuang  w8 = _mm_sub_epi32(u0, u8);
73991037db265ecdd914a26e056cf69207b4f50924ehkuang  w9 = _mm_sub_epi32(u1, u9);
74091037db265ecdd914a26e056cf69207b4f50924ehkuang  w10 = _mm_sub_epi32(u2, u10);
74191037db265ecdd914a26e056cf69207b4f50924ehkuang  w11 = _mm_sub_epi32(u3, u11);
74291037db265ecdd914a26e056cf69207b4f50924ehkuang  w12 = _mm_sub_epi32(u4, u12);
74391037db265ecdd914a26e056cf69207b4f50924ehkuang  w13 = _mm_sub_epi32(u5, u13);
74491037db265ecdd914a26e056cf69207b4f50924ehkuang  w14 = _mm_sub_epi32(u6, u14);
74591037db265ecdd914a26e056cf69207b4f50924ehkuang  w15 = _mm_sub_epi32(u7, u15);
74691037db265ecdd914a26e056cf69207b4f50924ehkuang
74791037db265ecdd914a26e056cf69207b4f50924ehkuang  // shift and rounding
74891037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
74991037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
75091037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
75191037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
75291037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
75391037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
75491037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
75591037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
75691037db265ecdd914a26e056cf69207b4f50924ehkuang  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
75791037db265ecdd914a26e056cf69207b4f50924ehkuang  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
75891037db265ecdd914a26e056cf69207b4f50924ehkuang  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
75991037db265ecdd914a26e056cf69207b4f50924ehkuang  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
76091037db265ecdd914a26e056cf69207b4f50924ehkuang  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
76191037db265ecdd914a26e056cf69207b4f50924ehkuang  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
76291037db265ecdd914a26e056cf69207b4f50924ehkuang  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
76391037db265ecdd914a26e056cf69207b4f50924ehkuang  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
76491037db265ecdd914a26e056cf69207b4f50924ehkuang
76591037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
76691037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
76791037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
76891037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
76991037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
77091037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
77191037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
77291037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
77391037db265ecdd914a26e056cf69207b4f50924ehkuang  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
77491037db265ecdd914a26e056cf69207b4f50924ehkuang  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
77591037db265ecdd914a26e056cf69207b4f50924ehkuang  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
77691037db265ecdd914a26e056cf69207b4f50924ehkuang  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
77791037db265ecdd914a26e056cf69207b4f50924ehkuang  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
77891037db265ecdd914a26e056cf69207b4f50924ehkuang  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
77991037db265ecdd914a26e056cf69207b4f50924ehkuang  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
78091037db265ecdd914a26e056cf69207b4f50924ehkuang  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
78191037db265ecdd914a26e056cf69207b4f50924ehkuang
78291037db265ecdd914a26e056cf69207b4f50924ehkuang  // back to 16-bit and pack 8 integers into __m128i
78391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_packs_epi32(u0, u1);
78491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_packs_epi32(u2, u3);
78591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_packs_epi32(u4, u5);
78691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_packs_epi32(u6, u7);
78791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_packs_epi32(u8, u9);
78891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_packs_epi32(u10, u11);
78991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_packs_epi32(u12, u13);
79091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_packs_epi32(u14, u15);
79191037db265ecdd914a26e056cf69207b4f50924ehkuang
79291037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
79391037db265ecdd914a26e056cf69207b4f50924ehkuang  s0 = _mm_add_epi16(in[0], in[2]);
79491037db265ecdd914a26e056cf69207b4f50924ehkuang  s1 = _mm_add_epi16(in[1], in[3]);
79591037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_sub_epi16(in[0], in[2]);
79691037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_sub_epi16(in[1], in[3]);
79791037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_unpacklo_epi16(in[4], in[5]);
79891037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_unpackhi_epi16(in[4], in[5]);
79991037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_unpacklo_epi16(in[6], in[7]);
80091037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_unpackhi_epi16(in[6], in[7]);
80191037db265ecdd914a26e056cf69207b4f50924ehkuang
80291037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
80391037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
80491037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
80591037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
80691037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
80791037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
80891037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
80991037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
81091037db265ecdd914a26e056cf69207b4f50924ehkuang
81191037db265ecdd914a26e056cf69207b4f50924ehkuang  w0 = _mm_add_epi32(v0, v4);
81291037db265ecdd914a26e056cf69207b4f50924ehkuang  w1 = _mm_add_epi32(v1, v5);
81391037db265ecdd914a26e056cf69207b4f50924ehkuang  w2 = _mm_add_epi32(v2, v6);
81491037db265ecdd914a26e056cf69207b4f50924ehkuang  w3 = _mm_add_epi32(v3, v7);
81591037db265ecdd914a26e056cf69207b4f50924ehkuang  w4 = _mm_sub_epi32(v0, v4);
81691037db265ecdd914a26e056cf69207b4f50924ehkuang  w5 = _mm_sub_epi32(v1, v5);
81791037db265ecdd914a26e056cf69207b4f50924ehkuang  w6 = _mm_sub_epi32(v2, v6);
81891037db265ecdd914a26e056cf69207b4f50924ehkuang  w7 = _mm_sub_epi32(v3, v7);
81991037db265ecdd914a26e056cf69207b4f50924ehkuang
82091037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
82191037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
82291037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
82391037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
82491037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
82591037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
82691037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
82791037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
82891037db265ecdd914a26e056cf69207b4f50924ehkuang
82991037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
83091037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
83191037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
83291037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
83391037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
83491037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
83591037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
83691037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
83791037db265ecdd914a26e056cf69207b4f50924ehkuang
83891037db265ecdd914a26e056cf69207b4f50924ehkuang  // back to 16-bit integers
83991037db265ecdd914a26e056cf69207b4f50924ehkuang  s4 = _mm_packs_epi32(u0, u1);
84091037db265ecdd914a26e056cf69207b4f50924ehkuang  s5 = _mm_packs_epi32(u2, u3);
84191037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_packs_epi32(u4, u5);
84291037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_packs_epi32(u6, u7);
84391037db265ecdd914a26e056cf69207b4f50924ehkuang
84491037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
84591037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_unpacklo_epi16(s2, s3);
84691037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_unpackhi_epi16(s2, s3);
84791037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_unpacklo_epi16(s6, s7);
84891037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_unpackhi_epi16(s6, s7);
84991037db265ecdd914a26e056cf69207b4f50924ehkuang
85091037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
85191037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
85291037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
85391037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
85491037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
85591037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
85691037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
85791037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
85891037db265ecdd914a26e056cf69207b4f50924ehkuang
85991037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
86091037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
86191037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
86291037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
86391037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
86491037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
86591037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
86691037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
86791037db265ecdd914a26e056cf69207b4f50924ehkuang
86891037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
86991037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
87091037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
87191037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
87291037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
87391037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
87491037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
87591037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
87691037db265ecdd914a26e056cf69207b4f50924ehkuang
87791037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_packs_epi32(v0, v1);
87891037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_packs_epi32(v2, v3);
87991037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_packs_epi32(v4, v5);
88091037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_packs_epi32(v6, v7);
88191037db265ecdd914a26e056cf69207b4f50924ehkuang
88291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = s0;
88391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_sub_epi16(k__const_0, s4);
88491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = s6;
88591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_sub_epi16(k__const_0, s2);
88691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = s3;
88791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_sub_epi16(k__const_0, s7);
88891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = s5;
88991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_sub_epi16(k__const_0, s1);
89091037db265ecdd914a26e056cf69207b4f50924ehkuang}
89191037db265ecdd914a26e056cf69207b4f50924ehkuang
89291037db265ecdd914a26e056cf69207b4f50924ehkuang
/*
 * 8x8 inverse hybrid transform followed by reconstruction.
 *
 * Reads eight rows of eight int16_t coefficients from `input` using aligned
 * 128-bit loads (so `input` has to be 16-byte aligned), runs two 1-D passes
 * chosen by `tx_type`, applies a saturating add of 16 and an arithmetic
 * right shift by 5 to every lane, and finally hands each row to
 * RECON_AND_STORE together with `dest`.
 *
 * tx_type: 0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST.
 * Any other value hits assert(0); when assertions are compiled out, no
 * transform pass runs but the rounding and store steps still execute.
 */
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  // `zero` and `stride` are never referenced directly in this function;
  // presumably RECON_AND_STORE picks them up by name (the 4x4 variant of the
  // macro uses `zero` that way), so they must stay in scope under these names.
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  __m128i rows[8];
  int r;

  // Fetch the coefficient block, one 8-lane row per register.
  for (r = 0; r < 8; ++r) {
    rows[r] = _mm_load_si128((const __m128i *)(input + 8 * r));
  }

  // Two in-place 1-D passes over the register block.
  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_1d_sse2(rows);
      idct8_1d_sse2(rows);
      break;
    case 1:  // ADST_DCT
      idct8_1d_sse2(rows);
      iadst8_1d_sse2(rows);
      break;
    case 2:  // DCT_ADST
      iadst8_1d_sse2(rows);
      idct8_1d_sse2(rows);
      break;
    case 3:  // ADST_ADST
      iadst8_1d_sse2(rows);
      iadst8_1d_sse2(rows);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift: saturating +16, then >> 5 on every lane.
  for (r = 0; r < 8; ++r) {
    rows[r] = _mm_srai_epi16(_mm_adds_epi16(rows[r], final_rounding), 5);
  }

  // One RECON_AND_STORE per row, always with the same `dest` expression.
  // NOTE(review): the macro presumably advances `dest` by `stride` itself --
  // confirm against its definition.
  for (r = 0; r < 8; ++r) {
    RECON_AND_STORE(dest, rows[r]);
  }
}
95991037db265ecdd914a26e056cf69207b4f50924ehkuang
9605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
961ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
962ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
963ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<4);
964ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
965ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
966ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
967ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
968ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
969ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
970ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
971ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
972ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
973ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
974ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
975ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
976ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
977ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
978ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
979ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Rows. Load 4-row input data.
9805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in0 = _mm_load_si128((const __m128i *)input);
9815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
9825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
9835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
984ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
985ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 8x4 Transpose
986ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
987ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
988ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage1
9895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { //NOLINT
990ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
991ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);
992ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
993ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
994ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
995ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
996ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
997ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
998ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
999ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
1000ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
1001ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
1002ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1003ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1004ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1005ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1006ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1007ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_4 = _mm_packs_epi32(tmp0, zero);
1008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_7 = _mm_packs_epi32(tmp2, zero);
1009ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp4, zero);
1010ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp6, zero);
1011ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1012ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1013ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage2
10145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { //NOLINT
1015ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
1016ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);
1017ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1018ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
1019ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
1020ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
1021ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
1022ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1023ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
1024ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
1025ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
1026ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
1027ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1028ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1029ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1030ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1031ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1032ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_0 = _mm_packs_epi32(tmp0, zero);
1033ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_1 = _mm_packs_epi32(tmp2, zero);
1034ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_2 = _mm_packs_epi32(tmp4, zero);
1035ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_3 = _mm_packs_epi32(tmp6, zero);
1036ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1037ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
1038ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
1039ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
1040ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
1041ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1042ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1043ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage3
10445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { //NOLINT
1045ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
1046ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
1047ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
1048ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
1049ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);
1050ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1051ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
1052ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
1053ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1054ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
1055ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
1056ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1057ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1058ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1059ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp0, zero);
1060ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp2, zero);
1061ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1062ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1063ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage4
1064ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(stp1_0, stp2_7);
1065ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(stp1_1, stp1_6);
1066ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(stp1_2, stp1_5);
1067ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(stp1_3, stp2_4);
1068ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_subs_epi16(stp1_3, stp2_4);
1069ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_subs_epi16(stp1_2, stp1_5);
1070ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_subs_epi16(stp1_1, stp1_6);
1071ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_subs_epi16(stp1_0, stp2_7);
1072ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1073ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Columns. 4x8 Transpose
1074ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
1075ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                in4, in5, in6, in7)
1076ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1077ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 1D idct8x8
10785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  IDCT8_1D
1079ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1080ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final rounding and shift
1081ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(in0, final_rounding);
1082ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(in1, final_rounding);
1083ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(in2, final_rounding);
1084ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(in3, final_rounding);
1085ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_adds_epi16(in4, final_rounding);
1086ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_adds_epi16(in5, final_rounding);
1087ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_adds_epi16(in6, final_rounding);
1088ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_adds_epi16(in7, final_rounding);
1089ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1090ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_srai_epi16(in0, 5);
1091ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_srai_epi16(in1, 5);
1092ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_srai_epi16(in2, 5);
1093ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_srai_epi16(in3, 5);
1094ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_srai_epi16(in4, 5);
1095ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_srai_epi16(in5, 5);
1096ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_srai_epi16(in6, 5);
1097ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_srai_epi16(in7, 5);
1098ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1099ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in0);
1100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in1);
1101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in2);
1102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in3);
1103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in4);
1104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in5);
1105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in6);
1106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in7);
1107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
/*
 * IDCT16_1D: stages 2 through 6 of a 16-point 1-D inverse DCT, evaluated on
 * the eight 16-bit lanes of each __m128i at once.
 *
 * The macro is not self-contained. It expands in a scope that provides:
 *   - in0..in15   inputs (__m128i); only read here.
 *   - stg2_0..stg2_7, stg3_0..stg3_3, stg4_0..stg4_7, stg6_0
 *                 coefficient pairs fed to the multiply-add steps.
 *   - rounding    32-bit term added before the DCT_CONST_BITS right shift.
 *   - stp1_0..stp1_15, stp1_8_0, stp1_12_0, stp2_0..stp2_15, tmp0..tmp3
 *                 writable __m128i variables (intermediate and result values).
 *
 * MULTIPLICATION_AND_ADD is defined earlier in this file. It receives two
 * interleaved lo/hi pairs, four constants and four destinations.
 * NOTE(review): it presumably performs the same madd / round / shift / pack
 * sequence that Stage5 writes out by hand, and may need further temporaries
 * from the enclosing scope -- confirm against its definition.
 *
 * The values the caller's stage 7 consumes afterwards are
 * stp2_0..stp2_7, stp1_8, stp1_9, stp2_10..stp2_13, stp1_14 and stp1_15.
 *
 * The expansion is a sequence of brace blocks, so the macro is invoked
 * without a trailing semicolon.
 */
#define IDCT16_1D \
  /* Stage2: odd-indexed inputs produce stp2_8..stp2_15. */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7);   \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7);   \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
    \
  /* Stage3: inputs 2, 14, 10, 6 produce stp1_4..stp1_7; add/sub */ \
  /* butterflies on the Stage2 results produce stp1_8_0..stp1_15. */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    /* The sums for 8 and 12 go to the _0 variables: Stage5 still needs */ \
    /* them after stp1_8 / stp1_12 themselves are produced. */ \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4: inputs 0, 8, 4, 12 produce stp2_0..stp2_3; butterflies on */ \
  /* stp1_4..stp1_7; multiply-add on the (9,14) and (10,13) pairs. */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
    \
  /* Stage5: butterflies on stp2_0..stp2_3, a hand-written multiply-add */ \
  /* on the (6,5) pair, and butterflies producing stp1_8..stp1_15. */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    /* 16x16 -> 32-bit multiply-add against stg4_1 and stg4_0. */ \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    /* Round, then shift the 32-bit products back down. */ \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    /* Narrow to 16 bits with signed saturation. */ \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    /* stp1_8 is written before stp1_11 is overwritten; both read the */ \
    /* Stage3 values stp1_8_0 and stp1_11. */ \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    /* Likewise stp1_12 is written before stp1_15 is overwritten. */ \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
    \
  /* Stage6: butterflies producing stp2_0..stp2_7, and multiply-add on */ \
  /* the (10,13) and (11,12) pairs producing stp2_10..stp2_13. */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    /* stp2_3 / stp2_0 are written before stp2_4 / stp2_7 are */ \
    /* overwritten; the order of these eight lines matters. */ \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }
1237ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
12385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
12395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                int stride) {
1240ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1241ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
1242ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
1243ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1247ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1248ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1249ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1250ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1251ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1254ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1255ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1256ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1257ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1258ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1259ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1260ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1261ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1262ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1263ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1264ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1265ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1267ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1268ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1269ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
1270ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
1271ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
1272ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in14 = zero, in15 = zero;
1273ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
1274ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
1275ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
1276ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
1277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
1278ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          r12 = zero, r13 = zero, r14 = zero, r15 = zero;
1279ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1280ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1281ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8_0, stp1_12_0;
1282ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1283ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1284ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1285ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
1286ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1287ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct.
1288ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 4; i++) {
1289ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // 1-D idct
1290ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i < 2) {
1291ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      if (i == 1) input += 128;
1292ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1293ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Load input data.
12945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in0 = _mm_load_si128((const __m128i *)input);
12955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
12965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
12975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
12985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
12995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
13005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
13015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
13025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
13035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
13045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
13055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
13065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
13075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
13085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
13095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
1310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
1312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
1313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
1314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in10, in11, in12, in13, in14, in15);
1315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 2) {
1318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
1319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in5, in6, in7);
1320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
1321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in13, in14, in15);
1322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 3) {
1325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
1326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
1327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
1328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in12, in13, in14, in15);
1329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
13315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    IDCT16_1D
1332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage7
1334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 0) {
1335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Left 8x16
1336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l0 = _mm_add_epi16(stp2_0, stp1_15);
1337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l1 = _mm_add_epi16(stp2_1, stp1_14);
1338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l2 = _mm_add_epi16(stp2_2, stp2_13);
1339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l3 = _mm_add_epi16(stp2_3, stp2_12);
1340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l4 = _mm_add_epi16(stp2_4, stp2_11);
1341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l5 = _mm_add_epi16(stp2_5, stp2_10);
1342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l6 = _mm_add_epi16(stp2_6, stp1_9);
1343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l7 = _mm_add_epi16(stp2_7, stp1_8);
1344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l8 = _mm_sub_epi16(stp2_7, stp1_8);
1345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l9 = _mm_sub_epi16(stp2_6, stp1_9);
1346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l10 = _mm_sub_epi16(stp2_5, stp2_10);
1347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l11 = _mm_sub_epi16(stp2_4, stp2_11);
1348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l12 = _mm_sub_epi16(stp2_3, stp2_12);
1349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l13 = _mm_sub_epi16(stp2_2, stp2_13);
1350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l14 = _mm_sub_epi16(stp2_1, stp1_14);
1351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l15 = _mm_sub_epi16(stp2_0, stp1_15);
1352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else if (i == 1) {
1353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Right 8x16
1354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r0 = _mm_add_epi16(stp2_0, stp1_15);
1355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r1 = _mm_add_epi16(stp2_1, stp1_14);
1356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r2 = _mm_add_epi16(stp2_2, stp2_13);
1357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r3 = _mm_add_epi16(stp2_3, stp2_12);
1358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r4 = _mm_add_epi16(stp2_4, stp2_11);
1359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r5 = _mm_add_epi16(stp2_5, stp2_10);
1360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r6 = _mm_add_epi16(stp2_6, stp1_9);
1361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r7 = _mm_add_epi16(stp2_7, stp1_8);
1362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r8 = _mm_sub_epi16(stp2_7, stp1_8);
1363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r9 = _mm_sub_epi16(stp2_6, stp1_9);
1364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r10 = _mm_sub_epi16(stp2_5, stp2_10);
1365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r11 = _mm_sub_epi16(stp2_4, stp2_11);
1366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r12 = _mm_sub_epi16(stp2_3, stp2_12);
1367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r13 = _mm_sub_epi16(stp2_2, stp2_13);
1368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r14 = _mm_sub_epi16(stp2_1, stp1_14);
1369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r15 = _mm_sub_epi16(stp2_0, stp1_15);
1370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else {
1371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 2-D
1372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_add_epi16(stp2_0, stp1_15);
1373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_add_epi16(stp2_1, stp1_14);
1374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_add_epi16(stp2_2, stp2_13);
1375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_add_epi16(stp2_3, stp2_12);
1376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_add_epi16(stp2_4, stp2_11);
1377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_add_epi16(stp2_5, stp2_10);
1378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_add_epi16(stp2_6, stp1_9);
1379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_add_epi16(stp2_7, stp1_8);
1380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_sub_epi16(stp2_7, stp1_8);
1381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_sub_epi16(stp2_6, stp1_9);
1382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_sub_epi16(stp2_5, stp2_10);
1383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_sub_epi16(stp2_4, stp2_11);
1384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_sub_epi16(stp2_3, stp2_12);
1385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_sub_epi16(stp2_2, stp2_13);
1386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_sub_epi16(stp2_1, stp1_14);
1387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_sub_epi16(stp2_0, stp1_15);
1388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Final rounding and shift
1390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_adds_epi16(in0, final_rounding);
1391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_adds_epi16(in1, final_rounding);
1392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_adds_epi16(in2, final_rounding);
1393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_adds_epi16(in3, final_rounding);
1394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_adds_epi16(in4, final_rounding);
1395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_adds_epi16(in5, final_rounding);
1396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_adds_epi16(in6, final_rounding);
1397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_adds_epi16(in7, final_rounding);
1398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_adds_epi16(in8, final_rounding);
1399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_adds_epi16(in9, final_rounding);
1400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_adds_epi16(in10, final_rounding);
1401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_adds_epi16(in11, final_rounding);
1402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_adds_epi16(in12, final_rounding);
1403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_adds_epi16(in13, final_rounding);
1404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_adds_epi16(in14, final_rounding);
1405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_adds_epi16(in15, final_rounding);
1406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_srai_epi16(in0, 6);
1408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_srai_epi16(in1, 6);
1409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_srai_epi16(in2, 6);
1410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_srai_epi16(in3, 6);
1411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_srai_epi16(in4, 6);
1412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_srai_epi16(in5, 6);
1413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_srai_epi16(in6, 6);
1414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_srai_epi16(in7, 6);
1415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_srai_epi16(in8, 6);
1416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_srai_epi16(in9, 6);
1417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_srai_epi16(in10, 6);
1418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_srai_epi16(in11, 6);
1419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_srai_epi16(in12, 6);
1420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_srai_epi16(in13, 6);
1421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_srai_epi16(in14, 6);
1422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_srai_epi16(in15, 6);
1423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in0);
1425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in1);
1426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in2);
1427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in3);
1428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in4);
1429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in5);
1430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in6);
1431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in7);
1432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in8);
1433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in9);
1434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in10);
1435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in11);
1436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in12);
1437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in13);
1438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in14);
1439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in15);
1440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      dest += 8 - (stride * 16);
1442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
14465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
1447f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  __m128i dc_value;
1448f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  const __m128i zero = _mm_setzero_si128();
1449f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int a, i;
1450f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
1451f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
1452f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(a * cospi_16_64);
1453f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = ROUND_POWER_OF_TWO(a, 6);
1454f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
1455f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  dc_value = _mm_set1_epi16(a);
1456f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
1457f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  for (i = 0; i < 2; ++i) {
1458f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1459f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1460f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1461f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1462f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1463f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1464f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1465f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1466f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1467f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1468f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1469f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1470f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1471f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1472f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1473f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1474f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    dest += 8 - (stride * 16);
1475f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  }
1476f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang}
1477f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
147891037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
147991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i tbuf[8];
148091037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res0, res0);
148191037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res1, tbuf);
148291037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res0 + 8, res1);
148391037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res1 + 8, res1 + 8);
148491037db265ecdd914a26e056cf69207b4f50924ehkuang
148591037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[8] = tbuf[0];
148691037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[9] = tbuf[1];
148791037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[10] = tbuf[2];
148891037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[11] = tbuf[3];
148991037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[12] = tbuf[4];
149091037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[13] = tbuf[5];
149191037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[14] = tbuf[6];
149291037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[15] = tbuf[7];
149391037db265ecdd914a26e056cf69207b4f50924ehkuang}
149491037db265ecdd914a26e056cf69207b4f50924ehkuang
14955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void iadst16_1d_8col(__m128i *in) {
149691037db265ecdd914a26e056cf69207b4f50924ehkuang  // perform 16x16 1-D ADST for 8 columns
149791037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i s[16], x[16], u[32], v[32];
149891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
149991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
150091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
150191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
150291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
150391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
150491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
150591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
150691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
150791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
150891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
150991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
151091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
151191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
151291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
151391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
151491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
151591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
151691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
151791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
151891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
151991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
152091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
152191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
152291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
152391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
152491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
152591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
152691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
152791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
152891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i kZero = _mm_set1_epi16(0);
152991037db265ecdd914a26e056cf69207b4f50924ehkuang
153091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
153191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
153291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
153391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
153491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
153591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
153691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
153791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
153891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
153991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
154091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
154191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
154291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
154391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
154491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
154591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
154691037db265ecdd914a26e056cf69207b4f50924ehkuang
154791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
154891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
154991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
155091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
155191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
155291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
155391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
155491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
155591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
155691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
155791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
155891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
155991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
156091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
156191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
156291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
156391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
156491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
156591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
156691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
156791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
156891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
156991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
157091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
157191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
157291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
157391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
157491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
157591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
157691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
157791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
157891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
157991037db265ecdd914a26e056cf69207b4f50924ehkuang
158091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[16]);
158191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[17]);
158291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[18]);
158391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[19]);
158491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], v[20]);
158591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], v[21]);
158691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], v[22]);
158791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], v[23]);
158891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], v[24]);
158991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], v[25]);
159091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], v[26]);
159191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], v[27]);
159291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], v[28]);
159391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], v[29]);
159491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], v[30]);
159591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], v[31]);
159691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[16] = _mm_sub_epi32(v[0], v[16]);
159791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[17] = _mm_sub_epi32(v[1], v[17]);
159891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[18] = _mm_sub_epi32(v[2], v[18]);
159991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[19] = _mm_sub_epi32(v[3], v[19]);
160091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[20] = _mm_sub_epi32(v[4], v[20]);
160191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[21] = _mm_sub_epi32(v[5], v[21]);
160291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[22] = _mm_sub_epi32(v[6], v[22]);
160391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[23] = _mm_sub_epi32(v[7], v[23]);
160491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[24] = _mm_sub_epi32(v[8], v[24]);
160591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[25] = _mm_sub_epi32(v[9], v[25]);
160691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[26] = _mm_sub_epi32(v[10], v[26]);
160791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[27] = _mm_sub_epi32(v[11], v[27]);
160891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[28] = _mm_sub_epi32(v[12], v[28]);
160991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[29] = _mm_sub_epi32(v[13], v[29]);
161091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[30] = _mm_sub_epi32(v[14], v[30]);
161191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[31] = _mm_sub_epi32(v[15], v[31]);
161291037db265ecdd914a26e056cf69207b4f50924ehkuang
161391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
161491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
161591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
161691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
161791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
161891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
161991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
162091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
162191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
162291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
162391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
162491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
162591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
162691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
162791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
162891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
162991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
163091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
163191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
163291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
163391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
163491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
163591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
163691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
163791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
163891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
163991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
164091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
164191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
164291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
164391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
164491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
164591037db265ecdd914a26e056cf69207b4f50924ehkuang
164691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
164791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
164891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
164991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
165091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
165191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
165291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
165391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
165491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
165591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
165691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
165791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
165891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
165991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
166091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
166191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
166291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
166391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
166491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
166591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
166691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
166791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
166891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
166991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
167091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
167191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
167291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
167391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
167491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
167591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
167691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
167791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
167891037db265ecdd914a26e056cf69207b4f50924ehkuang
167991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_packs_epi32(u[0], u[1]);
168091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_packs_epi32(u[2], u[3]);
168191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_packs_epi32(u[4], u[5]);
168291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_packs_epi32(u[6], u[7]);
168391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_packs_epi32(u[8], u[9]);
168491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_packs_epi32(u[10], u[11]);
168591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_packs_epi32(u[12], u[13]);
168691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_packs_epi32(u[14], u[15]);
168791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = _mm_packs_epi32(u[16], u[17]);
168891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = _mm_packs_epi32(u[18], u[19]);
168991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[20], u[21]);
169091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[22], u[23]);
169191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[24], u[25]);
169291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[26], u[27]);
169391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[28], u[29]);
169491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(u[30], u[31]);
169591037db265ecdd914a26e056cf69207b4f50924ehkuang
169691037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
169791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
169891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
169991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
170091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
170191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
170291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
170391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
170491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
170591037db265ecdd914a26e056cf69207b4f50924ehkuang
170691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
170791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
170891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
170991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
171091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
171191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
171291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
171391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
171491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
171591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
171691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
171791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
171891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
171991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
172091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
172191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
172291037db265ecdd914a26e056cf69207b4f50924ehkuang
172391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[8]);
172491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[9]);
172591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[10]);
172691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[11]);
172791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], v[12]);
172891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], v[13]);
172991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], v[14]);
173091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], v[15]);
173191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_sub_epi32(v[0], v[8]);
173291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_sub_epi32(v[1], v[9]);
173391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_sub_epi32(v[2], v[10]);
173491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_sub_epi32(v[3], v[11]);
173591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_sub_epi32(v[4], v[12]);
173691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_sub_epi32(v[5], v[13]);
173791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_sub_epi32(v[6], v[14]);
173891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_sub_epi32(v[7], v[15]);
173991037db265ecdd914a26e056cf69207b4f50924ehkuang
174091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
174191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
174291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
174391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
174491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
174591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
174691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
174791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
174891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
174991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
175091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
175191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
175291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
175391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
175491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
175591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
175691037db265ecdd914a26e056cf69207b4f50924ehkuang
175791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
175891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
175991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
176091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
176191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
176291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
176391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
176491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
176591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
176691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
176791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
176891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
176991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
177091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
177191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
177291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
177391037db265ecdd914a26e056cf69207b4f50924ehkuang
177491037db265ecdd914a26e056cf69207b4f50924ehkuang  x[0] = _mm_add_epi16(s[0], s[4]);
177591037db265ecdd914a26e056cf69207b4f50924ehkuang  x[1] = _mm_add_epi16(s[1], s[5]);
177691037db265ecdd914a26e056cf69207b4f50924ehkuang  x[2] = _mm_add_epi16(s[2], s[6]);
177791037db265ecdd914a26e056cf69207b4f50924ehkuang  x[3] = _mm_add_epi16(s[3], s[7]);
177891037db265ecdd914a26e056cf69207b4f50924ehkuang  x[4] = _mm_sub_epi16(s[0], s[4]);
177991037db265ecdd914a26e056cf69207b4f50924ehkuang  x[5] = _mm_sub_epi16(s[1], s[5]);
178091037db265ecdd914a26e056cf69207b4f50924ehkuang  x[6] = _mm_sub_epi16(s[2], s[6]);
178191037db265ecdd914a26e056cf69207b4f50924ehkuang  x[7] = _mm_sub_epi16(s[3], s[7]);
178291037db265ecdd914a26e056cf69207b4f50924ehkuang  x[8] = _mm_packs_epi32(u[0], u[1]);
178391037db265ecdd914a26e056cf69207b4f50924ehkuang  x[9] = _mm_packs_epi32(u[2], u[3]);
178491037db265ecdd914a26e056cf69207b4f50924ehkuang  x[10] = _mm_packs_epi32(u[4], u[5]);
178591037db265ecdd914a26e056cf69207b4f50924ehkuang  x[11] = _mm_packs_epi32(u[6], u[7]);
178691037db265ecdd914a26e056cf69207b4f50924ehkuang  x[12] = _mm_packs_epi32(u[8], u[9]);
178791037db265ecdd914a26e056cf69207b4f50924ehkuang  x[13] = _mm_packs_epi32(u[10], u[11]);
178891037db265ecdd914a26e056cf69207b4f50924ehkuang  x[14] = _mm_packs_epi32(u[12], u[13]);
178991037db265ecdd914a26e056cf69207b4f50924ehkuang  x[15] = _mm_packs_epi32(u[14], u[15]);
179091037db265ecdd914a26e056cf69207b4f50924ehkuang
179191037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
179291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
179391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
179491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
179591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
179691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
179791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
179891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
179991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
180091037db265ecdd914a26e056cf69207b4f50924ehkuang
180191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
180291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
180391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
180491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
180591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
180691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
180791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
180891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
180991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
181091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
181191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
181291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
181391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
181491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
181591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
181691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
181791037db265ecdd914a26e056cf69207b4f50924ehkuang
181891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[4]);
181991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[5]);
182091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[6]);
182191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[7]);
182291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_sub_epi32(v[0], v[4]);
182391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_sub_epi32(v[1], v[5]);
182491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_sub_epi32(v[2], v[6]);
182591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_sub_epi32(v[3], v[7]);
182691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], v[12]);
182791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], v[13]);
182891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], v[14]);
182991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], v[15]);
183091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_sub_epi32(v[8], v[12]);
183191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_sub_epi32(v[9], v[13]);
183291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_sub_epi32(v[10], v[14]);
183391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_sub_epi32(v[11], v[15]);
183491037db265ecdd914a26e056cf69207b4f50924ehkuang
183591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
183691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
183791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
183891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
183991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
184091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
184191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
184291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
184391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
184491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
184591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
184691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
184791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
184891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
184991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
185091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
185191037db265ecdd914a26e056cf69207b4f50924ehkuang
185291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
185391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
185491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
185591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
185691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
185791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
185891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
185991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
186091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
186191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
186291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
186391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
186491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
186591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
186691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
186791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
186891037db265ecdd914a26e056cf69207b4f50924ehkuang
186991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_add_epi16(x[0], x[2]);
187091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_add_epi16(x[1], x[3]);
187191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_sub_epi16(x[0], x[2]);
187291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_sub_epi16(x[1], x[3]);
187391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_packs_epi32(v[0], v[1]);
187491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_packs_epi32(v[2], v[3]);
187591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_packs_epi32(v[4], v[5]);
187691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_packs_epi32(v[6], v[7]);
187791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = _mm_add_epi16(x[8], x[10]);
187891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = _mm_add_epi16(x[9], x[11]);
187991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_sub_epi16(x[8], x[10]);
188091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_sub_epi16(x[9], x[11]);
188191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(v[8], v[9]);
188291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(v[10], v[11]);
188391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(v[12], v[13]);
188491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(v[14], v[15]);
188591037db265ecdd914a26e056cf69207b4f50924ehkuang
188691037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 4
188791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
188891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
188991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
189091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
189191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
189291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
189391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
189491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
189591037db265ecdd914a26e056cf69207b4f50924ehkuang
189691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
189791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
189891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
189991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
190091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
190191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
190291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
190391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
190491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
190591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
190691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
190791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
190891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
190991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
191091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
191191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
191291037db265ecdd914a26e056cf69207b4f50924ehkuang
191391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
191491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
191591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
191691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
191791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
191891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
191991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
192091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
192191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
192291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
192391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
192491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
192591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
192691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
192791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
192891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
192991037db265ecdd914a26e056cf69207b4f50924ehkuang
193091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
193191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
193291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
193391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
193491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
193591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
193691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
193791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
193891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
193991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
194091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
194191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
194291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
194391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
194491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
194591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
194691037db265ecdd914a26e056cf69207b4f50924ehkuang
194791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = s[0];
194891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_sub_epi16(kZero, s[8]);
194991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = s[12];
195091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_sub_epi16(kZero, s[4]);
195191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_packs_epi32(v[4], v[5]);
195291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_packs_epi32(v[12], v[13]);
195391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_packs_epi32(v[8], v[9]);
195491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_packs_epi32(v[0], v[1]);
195591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_packs_epi32(v[2], v[3]);
195691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_packs_epi32(v[10], v[11]);
195791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_packs_epi32(v[14], v[15]);
195891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_packs_epi32(v[6], v[7]);
195991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = s[5];
196091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_sub_epi16(kZero, s[13]);
196191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = s[9];
196291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_sub_epi16(kZero, s[1]);
196391037db265ecdd914a26e056cf69207b4f50924ehkuang}
196491037db265ecdd914a26e056cf69207b4f50924ehkuang
19655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void idct16_1d_8col(__m128i *in) {
196691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
196791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
196891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
196991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
197091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
197191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
197291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
197391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
197491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
197591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
197691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
197791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
197891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
197991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
198091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
198191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
198291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
198391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
198491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
198591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
198691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
198791037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i v[16], u[16], s[16], t[16];
198891037db265ecdd914a26e056cf69207b4f50924ehkuang
198991037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
199091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = in[0];
199191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = in[8];
199291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = in[4];
199391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = in[12];
199491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = in[2];
199591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = in[10];
199691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = in[6];
199791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = in[14];
199891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = in[1];
199991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = in[9];
200091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = in[5];
200191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = in[13];
200291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = in[3];
200391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = in[11];
200491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = in[7];
200591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = in[15];
200691037db265ecdd914a26e056cf69207b4f50924ehkuang
200791037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
200891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
200991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
201091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
201191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
201291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
201391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
201491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
201591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
201691037db265ecdd914a26e056cf69207b4f50924ehkuang
201791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
201891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
201991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
202091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
202191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
202291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
202391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
202491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
202591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
202691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
202791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
202891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
202991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
203091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
203191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
203291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
203391037db265ecdd914a26e056cf69207b4f50924ehkuang
203491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
203591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
203691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
203791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
203891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
203991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
204091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
204191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
204291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
204391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
204491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
204591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
204691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
204791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
204891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
204991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
205091037db265ecdd914a26e056cf69207b4f50924ehkuang
205191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
205291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
205391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
205491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
205591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
205691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
205791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
205891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
205991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
206091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
206191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
206291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
206391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
206491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
206591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
206691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
206791037db265ecdd914a26e056cf69207b4f50924ehkuang
206891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8]  = _mm_packs_epi32(u[0], u[1]);
206991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(u[2], u[3]);
207091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9]  = _mm_packs_epi32(u[4], u[5]);
207191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[6], u[7]);
207291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[8], u[9]);
207391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[10], u[11]);
207491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[12], u[13]);
207591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[14], u[15]);
207691037db265ecdd914a26e056cf69207b4f50924ehkuang
207791037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
207891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[0] = s[0];
207991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[1] = s[1];
208091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[2] = s[2];
208191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[3] = s[3];
208291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
208391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
208491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
208591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
208691037db265ecdd914a26e056cf69207b4f50924ehkuang
208791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
208891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
208991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
209091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
209191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
209291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
209391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
209491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
209591037db265ecdd914a26e056cf69207b4f50924ehkuang
209691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
209791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
209891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
209991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
210091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
210191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
210291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
210391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
210491037db265ecdd914a26e056cf69207b4f50924ehkuang
210591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
210691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
210791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
210891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
210991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
211091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
211191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
211291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
211391037db265ecdd914a26e056cf69207b4f50924ehkuang
211491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[4] = _mm_packs_epi32(u[0], u[1]);
211591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[7] = _mm_packs_epi32(u[2], u[3]);
211691037db265ecdd914a26e056cf69207b4f50924ehkuang  t[5] = _mm_packs_epi32(u[4], u[5]);
211791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[6] = _mm_packs_epi32(u[6], u[7]);
211891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[8] = _mm_add_epi16(s[8], s[9]);
211991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[9] = _mm_sub_epi16(s[8], s[9]);
212091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[10] = _mm_sub_epi16(s[11], s[10]);
212191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[11] = _mm_add_epi16(s[10], s[11]);
212291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[12] = _mm_add_epi16(s[12], s[13]);
212391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[13] = _mm_sub_epi16(s[12], s[13]);
212491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[14] = _mm_sub_epi16(s[15], s[14]);
212591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[15] = _mm_add_epi16(s[14], s[15]);
212691037db265ecdd914a26e056cf69207b4f50924ehkuang
212791037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 4
212891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
212991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
213091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
213191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
213291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
213391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
213491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
213591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
213691037db265ecdd914a26e056cf69207b4f50924ehkuang
213791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
213891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
213991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
214091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
214191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
214291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
214391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
214491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
214591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
214691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
214791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
214891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
214991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
215091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
215191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
215291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
215391037db265ecdd914a26e056cf69207b4f50924ehkuang
215491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
215591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
215691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
215791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
215891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
215991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
216091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
216191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
216291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
216391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
216491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
216591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
216691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
216791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
216891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
216991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
217091037db265ecdd914a26e056cf69207b4f50924ehkuang
217191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
217291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
217391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
217491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
217591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
217691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
217791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
217891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
217991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
218091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
218191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
218291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
218391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
218491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
218591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
218691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
218791037db265ecdd914a26e056cf69207b4f50924ehkuang
218891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_packs_epi32(u[0], u[1]);
218991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_packs_epi32(u[2], u[3]);
219091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_packs_epi32(u[4], u[5]);
219191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_packs_epi32(u[6], u[7]);
219291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_add_epi16(t[4], t[5]);
219391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_sub_epi16(t[4], t[5]);
219491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_sub_epi16(t[7], t[6]);
219591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_add_epi16(t[6], t[7]);
219691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = t[8];
219791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = t[15];
219891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9]  = _mm_packs_epi32(u[8], u[9]);
219991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[10], u[11]);
220091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[12], u[13]);
220191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[14], u[15]);
220291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = t[11];
220391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = t[12];
220491037db265ecdd914a26e056cf69207b4f50924ehkuang
220591037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 5
220691037db265ecdd914a26e056cf69207b4f50924ehkuang  t[0] = _mm_add_epi16(s[0], s[3]);
220791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[1] = _mm_add_epi16(s[1], s[2]);
220891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[2] = _mm_sub_epi16(s[1], s[2]);
220991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[3] = _mm_sub_epi16(s[0], s[3]);
221091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[4] = s[4];
221191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[7] = s[7];
221291037db265ecdd914a26e056cf69207b4f50924ehkuang
221391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
221491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
221591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
221691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
221791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
221891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
221991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
222091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
222191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
222291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
222391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
222491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
222591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
222691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
222791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[5] = _mm_packs_epi32(u[0], u[1]);
222891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[6] = _mm_packs_epi32(u[2], u[3]);
222991037db265ecdd914a26e056cf69207b4f50924ehkuang
223091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[8] = _mm_add_epi16(s[8], s[11]);
223191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[9] = _mm_add_epi16(s[9], s[10]);
223291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[10] = _mm_sub_epi16(s[9], s[10]);
223391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[11] = _mm_sub_epi16(s[8], s[11]);
223491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[12] = _mm_sub_epi16(s[15], s[12]);
223591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[13] = _mm_sub_epi16(s[14], s[13]);
223691037db265ecdd914a26e056cf69207b4f50924ehkuang  t[14] = _mm_add_epi16(s[13], s[14]);
223791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[15] = _mm_add_epi16(s[12], s[15]);
223891037db265ecdd914a26e056cf69207b4f50924ehkuang
223991037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 6
224091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_add_epi16(t[0], t[7]);
224191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_add_epi16(t[1], t[6]);
224291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_add_epi16(t[2], t[5]);
224391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_add_epi16(t[3], t[4]);
224491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_sub_epi16(t[3], t[4]);
224591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_sub_epi16(t[2], t[5]);
224691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_sub_epi16(t[1], t[6]);
224791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_sub_epi16(t[0], t[7]);
224891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = t[8];
224991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = t[9];
225091037db265ecdd914a26e056cf69207b4f50924ehkuang
225191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
225291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
225391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
225491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
225591037db265ecdd914a26e056cf69207b4f50924ehkuang
225691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
225791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
225891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
225991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
226091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
226191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
226291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
226391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
226491037db265ecdd914a26e056cf69207b4f50924ehkuang
226591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
226691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
226791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
226891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
226991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
227091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
227191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
227291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
227391037db265ecdd914a26e056cf69207b4f50924ehkuang
227491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
227591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
227691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
227791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
227891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
227991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
228091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
228191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
228291037db265ecdd914a26e056cf69207b4f50924ehkuang
228391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[0], u[1]);
228491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[2], u[3]);
228591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[4], u[5]);
228691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[6], u[7]);
228791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = t[14];
228891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = t[15];
228991037db265ecdd914a26e056cf69207b4f50924ehkuang
229091037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 7
229191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_add_epi16(s[0], s[15]);
229291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_add_epi16(s[1], s[14]);
229391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_add_epi16(s[2], s[13]);
229491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_add_epi16(s[3], s[12]);
229591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_add_epi16(s[4], s[11]);
229691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_add_epi16(s[5], s[10]);
229791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_add_epi16(s[6], s[9]);
229891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_add_epi16(s[7], s[8]);
229991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_sub_epi16(s[7], s[8]);
230091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_sub_epi16(s[6], s[9]);
230191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_sub_epi16(s[5], s[10]);
230291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_sub_epi16(s[4], s[11]);
230391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = _mm_sub_epi16(s[3], s[12]);
230491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_sub_epi16(s[2], s[13]);
230591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = _mm_sub_epi16(s[1], s[14]);
230691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_sub_epi16(s[0], s[15]);
230791037db265ecdd914a26e056cf69207b4f50924ehkuang}
230891037db265ecdd914a26e056cf69207b4f50924ehkuang
// One 1-D pass built on idct16_1d_8col.
//
// in0 and in1 are each arrays of sixteen __m128i vectors. The pair is first
// rearranged by array_transpose_16x16(), after which idct16_1d_8col() is run
// on each array in place. Both arrays are overwritten with the result.
static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
  // Transpose both arrays together before transforming.
  array_transpose_16x16(in0, in1);

  // Transform each array independently.
  idct16_1d_8col(in0);
  idct16_1d_8col(in1);
}
231491037db265ecdd914a26e056cf69207b4f50924ehkuang
// One 1-D pass built on iadst16_1d_8col.
//
// in0 and in1 are each arrays of sixteen __m128i vectors. The pair is first
// rearranged by array_transpose_16x16(), after which iadst16_1d_8col() is run
// on each array in place. Both arrays are overwritten with the result.
static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
  // Transpose both arrays together before transforming.
  array_transpose_16x16(in0, in1);

  // Transform each array independently.
  iadst16_1d_8col(in0);
  iadst16_1d_8col(in1);
}
232091037db265ecdd914a26e056cf69207b4f50924ehkuang
23215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
23225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
23235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
23245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
23255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
23265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
23275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
23285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
23295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
23305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
23315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
23325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
23335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
23345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
23355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
23365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
23375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
23385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
233991037db265ecdd914a26e056cf69207b4f50924ehkuang}
234091037db265ecdd914a26e056cf69207b4f50924ehkuang
234191037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
234291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
234391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i zero = _mm_setzero_si128();
234491037db265ecdd914a26e056cf69207b4f50924ehkuang  // Final rounding and shift
234591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_adds_epi16(in[0], final_rounding);
234691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_adds_epi16(in[1], final_rounding);
234791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_adds_epi16(in[2], final_rounding);
234891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_adds_epi16(in[3], final_rounding);
234991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_adds_epi16(in[4], final_rounding);
235091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_adds_epi16(in[5], final_rounding);
235191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_adds_epi16(in[6], final_rounding);
235291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_adds_epi16(in[7], final_rounding);
235391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_adds_epi16(in[8], final_rounding);
235491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_adds_epi16(in[9], final_rounding);
235591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_adds_epi16(in[10], final_rounding);
235691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_adds_epi16(in[11], final_rounding);
235791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = _mm_adds_epi16(in[12], final_rounding);
235891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_adds_epi16(in[13], final_rounding);
235991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = _mm_adds_epi16(in[14], final_rounding);
236091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_adds_epi16(in[15], final_rounding);
236191037db265ecdd914a26e056cf69207b4f50924ehkuang
236291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_srai_epi16(in[0], 6);
236391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_srai_epi16(in[1], 6);
236491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_srai_epi16(in[2], 6);
236591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_srai_epi16(in[3], 6);
236691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_srai_epi16(in[4], 6);
236791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_srai_epi16(in[5], 6);
236891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_srai_epi16(in[6], 6);
236991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_srai_epi16(in[7], 6);
237091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_srai_epi16(in[8], 6);
237191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_srai_epi16(in[9], 6);
237291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_srai_epi16(in[10], 6);
237391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_srai_epi16(in[11], 6);
237491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = _mm_srai_epi16(in[12], 6);
237591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_srai_epi16(in[13], 6);
237691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = _mm_srai_epi16(in[14], 6);
237791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_srai_epi16(in[15], 6);
237891037db265ecdd914a26e056cf69207b4f50924ehkuang
237991037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[0]);
238091037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[1]);
238191037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[2]);
238291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[3]);
238391037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[4]);
238491037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[5]);
238591037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[6]);
238691037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[7]);
238791037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[8]);
238891037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[9]);
238991037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[10]);
239091037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[11]);
239191037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[12]);
239291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[13]);
239391037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[14]);
239491037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[15]);
239591037db265ecdd914a26e056cf69207b4f50924ehkuang}
239691037db265ecdd914a26e056cf69207b4f50924ehkuang
// Runs two 1-D 16-point passes selected by tx_type over the data in input
// and passes the result to write_buffer_8x16() together with dest/stride.
//
// The input is loaded as two sets of sixteen vectors: one starting at input
// and one starting at input + 8. After both passes, the first set is written
// starting at dest and the second starting at dest + 8.
//
// tx_type selects the pair of passes (first pass, second pass):
//   0 (DCT_DCT)   : idct16_1d_sse2,  idct16_1d_sse2
//   1 (ADST_DCT)  : idct16_1d_sse2,  iadst16_1d_sse2
//   2 (DCT_ADST)  : iadst16_1d_sse2, idct16_1d_sse2
//   3 (ADST_ADST) : iadst16_1d_sse2, iadst16_1d_sse2
// Any other value trips assert(0); when asserts are compiled out, no pass is
// applied and the loaded data is written as-is.
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in0[16], in1[16];

  load_buffer_8x16(input, in0);
  load_buffer_8x16(input + 8, in1);

  if (tx_type >= 0 && tx_type <= 3) {
    // Bit 1 of tx_type picks the first pass.
    if (tx_type & 2) {
      iadst16_1d_sse2(in0, in1);
    } else {
      idct16_1d_sse2(in0, in1);
    }

    // Bit 0 of tx_type picks the second pass.
    if (tx_type & 1) {
      iadst16_1d_sse2(in0, in1);
    } else {
      idct16_1d_sse2(in0, in1);
    }
  } else {
    assert(0);
  }

  write_buffer_8x16(dest, in0, stride);
  write_buffer_8x16(dest + 8, in1, stride);
}
243191037db265ecdd914a26e056cf69207b4f50924ehkuang
24325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
24335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               int stride) {
2434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
2436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
2437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
2441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
2442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
2443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
2444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
2450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
2451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
2455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
2456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
2464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
2465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
2466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in14 = zero, in15 = zero;
2467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
2468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
2469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
2470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
2472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8_0, stp1_12_0;
2474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
2476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
2478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 1-D idct. Load input data.
24795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in0 = _mm_load_si128((const __m128i *)input);
24805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
24815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
24825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
24835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
24845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
24855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
24865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
2487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
2489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
2490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage2
2492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
2494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
2495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
2496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);
2497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
2501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
2502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
2503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
2504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding);
2512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding);
2513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_add_epi32(tmp5, rounding);
2514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_add_epi32(tmp7, rounding);
2515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_8 = _mm_packs_epi32(tmp0, zero);
2526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_15 = _mm_packs_epi32(tmp2, zero);
2527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_9 = _mm_packs_epi32(tmp4, zero);
2528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_14 = _mm_packs_epi32(tmp6, zero);
2529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_10 = _mm_packs_epi32(tmp1, zero);
2531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_13 = _mm_packs_epi32(tmp3, zero);
2532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_11 = _mm_packs_epi32(tmp5, zero);
2533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_12 = _mm_packs_epi32(tmp7, zero);
2534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage3
2537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
2539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);
2540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
2544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);
2545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_4 = _mm_packs_epi32(tmp0, zero);
2557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_7 = _mm_packs_epi32(tmp2, zero);
2558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp4, zero);
2559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp6, zero);
2560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
2562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
2563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
2564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
2565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
2567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
2568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
2569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
2570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2572ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage4
2573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
2575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
2576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
2577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
2582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
2583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding);
2593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding);
2594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_add_epi32(tmp5, rounding);
2595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_add_epi32(tmp7, rounding);
2596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2605ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2606ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_0 = _mm_packs_epi32(tmp0, zero);
2607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_1 = _mm_packs_epi32(tmp2, zero);
2608ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_2 = _mm_packs_epi32(tmp4, zero);
2609ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_3 = _mm_packs_epi32(tmp6, zero);
2610ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_9 = _mm_packs_epi32(tmp1, zero);
2611ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_14 = _mm_packs_epi32(tmp3, zero);
2612ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_10 = _mm_packs_epi32(tmp5, zero);
2613ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_13 = _mm_packs_epi32(tmp7, zero);
2614ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2615ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
2616ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
2617ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
2618ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
2619ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2620ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2621ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage5 and Stage6
2622ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2623ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
2624ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
2625ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
2626ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
2627ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2628ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
2629ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
2630ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
2631ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
2632ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2633ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
2634ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
2635ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
2636ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
2637ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2638ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2639ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage6
2640ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2641ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
2642ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2643ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2644ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2645ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2646ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2647ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2648ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2649ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2650ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2651ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2652ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding);
2653ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding);
2654ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2655ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2656ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2657ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2658ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2659ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2660ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2661ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2662ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2663ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2664ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2665ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2666ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp1, zero);
2667ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp3, zero);
2668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_10 = _mm_packs_epi32(tmp0, zero);
2669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_13 = _mm_packs_epi32(tmp2, zero);
2670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_11 = _mm_packs_epi32(tmp4, zero);
2671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_12 = _mm_packs_epi32(tmp6, zero);
2672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
2674ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
2675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
2676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
2677ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
2678ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
2679ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
2680ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
2681ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2682ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage7. Left 8x16 only.
2684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l0 = _mm_add_epi16(stp2_0, stp1_15);
2685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l1 = _mm_add_epi16(stp2_1, stp1_14);
2686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l2 = _mm_add_epi16(stp2_2, stp2_13);
2687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l3 = _mm_add_epi16(stp2_3, stp2_12);
2688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l4 = _mm_add_epi16(stp2_4, stp2_11);
2689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l5 = _mm_add_epi16(stp2_5, stp2_10);
2690ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l6 = _mm_add_epi16(stp2_6, stp1_9);
2691ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l7 = _mm_add_epi16(stp2_7, stp1_8);
2692ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l8 = _mm_sub_epi16(stp2_7, stp1_8);
2693ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l9 = _mm_sub_epi16(stp2_6, stp1_9);
2694ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l10 = _mm_sub_epi16(stp2_5, stp2_10);
2695ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l11 = _mm_sub_epi16(stp2_4, stp2_11);
2696ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l12 = _mm_sub_epi16(stp2_3, stp2_12);
2697ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l13 = _mm_sub_epi16(stp2_2, stp2_13);
2698ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l14 = _mm_sub_epi16(stp2_1, stp1_14);
2699ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l15 = _mm_sub_epi16(stp2_0, stp1_15);
2700ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2701ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 2-D idct. We do 2 8x16 blocks.
2702ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 2; i++) {
2703ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 0)
2704ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
2705ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in5, in6, in7);
2706ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2707ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 1)
2708ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
2709ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
2710ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2711ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
2712ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
27135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    IDCT16_1D
2714ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2715ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage7
2716ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in0 = _mm_add_epi16(stp2_0, stp1_15);
2717ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in1 = _mm_add_epi16(stp2_1, stp1_14);
2718ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in2 = _mm_add_epi16(stp2_2, stp2_13);
2719ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in3 = _mm_add_epi16(stp2_3, stp2_12);
2720ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in4 = _mm_add_epi16(stp2_4, stp2_11);
2721ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in5 = _mm_add_epi16(stp2_5, stp2_10);
2722ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in6 = _mm_add_epi16(stp2_6, stp1_9);
2723ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in7 = _mm_add_epi16(stp2_7, stp1_8);
2724ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = _mm_sub_epi16(stp2_7, stp1_8);
2725ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in9 = _mm_sub_epi16(stp2_6, stp1_9);
2726ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in10 = _mm_sub_epi16(stp2_5, stp2_10);
2727ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in11 = _mm_sub_epi16(stp2_4, stp2_11);
2728ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in12 = _mm_sub_epi16(stp2_3, stp2_12);
2729ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in13 = _mm_sub_epi16(stp2_2, stp2_13);
2730ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in14 = _mm_sub_epi16(stp2_1, stp1_14);
2731ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in15 = _mm_sub_epi16(stp2_0, stp1_15);
2732ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2733ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Final rounding and shift
2734ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in0 = _mm_adds_epi16(in0, final_rounding);
2735ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in1 = _mm_adds_epi16(in1, final_rounding);
2736ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in2 = _mm_adds_epi16(in2, final_rounding);
2737ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in3 = _mm_adds_epi16(in3, final_rounding);
2738ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in4 = _mm_adds_epi16(in4, final_rounding);
2739ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in5 = _mm_adds_epi16(in5, final_rounding);
2740ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in6 = _mm_adds_epi16(in6, final_rounding);
2741ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in7 = _mm_adds_epi16(in7, final_rounding);
2742ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = _mm_adds_epi16(in8, final_rounding);
2743ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in9 = _mm_adds_epi16(in9, final_rounding);
2744ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in10 = _mm_adds_epi16(in10, final_rounding);
2745ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in11 = _mm_adds_epi16(in11, final_rounding);
2746ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in12 = _mm_adds_epi16(in12, final_rounding);
2747ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in13 = _mm_adds_epi16(in13, final_rounding);
2748ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in14 = _mm_adds_epi16(in14, final_rounding);
2749ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in15 = _mm_adds_epi16(in15, final_rounding);
2750ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2751ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in0 = _mm_srai_epi16(in0, 6);
2752ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in1 = _mm_srai_epi16(in1, 6);
2753ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in2 = _mm_srai_epi16(in2, 6);
2754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in3 = _mm_srai_epi16(in3, 6);
2755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in4 = _mm_srai_epi16(in4, 6);
2756ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in5 = _mm_srai_epi16(in5, 6);
2757ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in6 = _mm_srai_epi16(in6, 6);
2758ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in7 = _mm_srai_epi16(in7, 6);
2759ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = _mm_srai_epi16(in8, 6);
2760ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in9 = _mm_srai_epi16(in9, 6);
2761ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in10 = _mm_srai_epi16(in10, 6);
2762ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in11 = _mm_srai_epi16(in11, 6);
2763ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in12 = _mm_srai_epi16(in12, 6);
2764ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in13 = _mm_srai_epi16(in13, 6);
2765ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in14 = _mm_srai_epi16(in14, 6);
2766ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in15 = _mm_srai_epi16(in15, 6);
2767ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2768ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in0);
2769ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in1);
2770ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in2);
2771ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in3);
2772ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in4);
2773ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in5);
2774ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in6);
2775ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in7);
2776ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in8);
2777ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in9);
2778ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in10);
2779ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in11);
2780ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in12);
2781ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in13);
2782ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in14);
2783ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in15);
2784ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2785ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest += 8 - (stride * 16);
2786ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2787ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
2788ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// LOAD_DQCOEFF(reg, input)
//
// Loads one 128-bit vector from the address currently held in `input` into
// `reg` with an aligned load, then advances `input` by 8 elements.
//
//   reg   - __m128i lvalue that receives the loaded vector.
//   input - modifiable pointer lvalue. It is evaluated twice (once for the
//           load, once for the increment), so the argument expression must
//           be free of side effects. _mm_load_si128 requires the address to
//           be 16-byte aligned.
//
// Expands to a brace-enclosed compound statement. Both arguments are
// parenthesized in the expansion, and the definition ends on the closing
// brace (no trailing line continuation), so the line that follows the macro
// is never spliced into it.
#define LOAD_DQCOEFF(reg, input)                      \
  {                                                   \
    (reg) = _mm_load_si128((const __m128i *)(input)); \
    (input) += 8;                                     \
  }
2794f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
27955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#define IDCT32_1D \
27965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage1 */ \
27975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
27985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
27995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
28005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
28015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
28025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
28045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
28055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \
28065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
28075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
28095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
28105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
28115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
28125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
28145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
28155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
28165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
28175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
28195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
28205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_17, stp1_30) \
28215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
28225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
28235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_19, stp1_28) \
28245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
28255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
28265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_21, stp1_26) \
28275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
28285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
28295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_23, stp1_24) \
28305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
28315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
28325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage2 */ \
28335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
28345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
28355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
28365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
28375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
28385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
28405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
28415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
28425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
28435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
28455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
28465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_14) \
28475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
28485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
28495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_11, stp2_12) \
28505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
28525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
28535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
28545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
28555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
28575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
28585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
28595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
28605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
28625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
28635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
28645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
28655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
28675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
28685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
28695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
28705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
28715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
28725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage3 */ \
28735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
28745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
28755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
28765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
28775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
28785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
28805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
28815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
28825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
28835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
28855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
28865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
28875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
28885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
28905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
28915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_6) \
28925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
28945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
28955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
28965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
28975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
28985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
28995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
29005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
29015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
29035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
29045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_18, stp1_29) \
29055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
29065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
29075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_22, stp1_25) \
29085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_16 = stp2_16; \
29105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_31 = stp2_31; \
29115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_19 = stp2_19; \
29125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_20 = stp2_20; \
29135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_23 = stp2_23; \
29145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_24 = stp2_24; \
29155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_27 = stp2_27; \
29165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_28 = stp2_28; \
29175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
29185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
29195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage4 */ \
29205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
29215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
29225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
29235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
29245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
29255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
29275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
29285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
29295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
29305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
29325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
29335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_2, stp2_3) \
29345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
29365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
29375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
29385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
29395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
29415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
29425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_10, stp2_13) \
29435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_8 = stp1_8; \
29455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_15 = stp1_15; \
29465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_11 = stp1_11; \
29475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_12 = stp1_12; \
29485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
29505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
29515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
29525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
29535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
29545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
29555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
29565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
29575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
29595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
29605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
29615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
29625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
29635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
29645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
29655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
29665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
29675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
29685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage5 */ \
29695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
29705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
29715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
29725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
29735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
29745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
29765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
29775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
29785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
29795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
29815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
29825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
29845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
29855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
29865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
29875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
29895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
29905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
29915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
29925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp0 = _mm_add_epi32(tmp0, rounding); \
29945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp1 = _mm_add_epi32(tmp1, rounding); \
29955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp2 = _mm_add_epi32(tmp2, rounding); \
29965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp3 = _mm_add_epi32(tmp3, rounding); \
29975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
29995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
30005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
30015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
30025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
30045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
30055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_4 = stp2_4; \
30075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_7 = stp2_7; \
30085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
30105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
30115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
30125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
30135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
30145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
30155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
30165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
30175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_16 = stp2_16; \
30195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_17 = stp2_17; \
30205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
30225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
30235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_19, stp1_28) \
30245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
30255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
30265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_21, stp1_26) \
30275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_22 = stp2_22; \
30295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_23 = stp2_23; \
30305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_24 = stp2_24; \
30315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_25 = stp2_25; \
30325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_30 = stp2_30; \
30335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_31 = stp2_31; \
30345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
30355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
30365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage6 */ \
30375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
30385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
30395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
30405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
30415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
30425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
30445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
30455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
30465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
30475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
30485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
30495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
30505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
30515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_8 = stp1_8; \
30535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_9 = stp1_9; \
30545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_14 = stp1_14; \
30555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_15 = stp1_15; \
30565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
30585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
30595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_13, stp2_11, stp2_12) \
30605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
30625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
30635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
30645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
30655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
30665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
30675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
30685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
30695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
30715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
30725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
30735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
30745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
30755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
30765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
30775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
30785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
30795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
30805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage7 */ \
30815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
30825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
30835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
30845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
30855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
30865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
30885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
30895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
30905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
30915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
30935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
30945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
30955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
30965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
30975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
30985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
30995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
31005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
31015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
31025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
31035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
31045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
31055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
31065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
31075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
31085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_16 = stp2_16; \
31105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_17 = stp2_17; \
31115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_18 = stp2_18; \
31125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_19 = stp2_19; \
31135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
31155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
31165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_21, stp1_26) \
31175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
31185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
31195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_23, stp1_24) \
31205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_28 = stp2_28; \
31225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_29 = stp2_29; \
31235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_30 = stp2_30; \
31245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_31 = stp2_31; \
31255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
31265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang// Only the upper-left 8x8 block has non-zero coefficients.
31285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
31295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                 int stride) {
31305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
31315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
31325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // idct constants for each stage
31345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
31355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
31365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
31375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
31385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
31395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
31405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
31415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
31425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
31435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
31445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
31455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
31465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
31475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
31485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
31495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
31505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
31525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
31535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
31545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
31555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
31565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
31575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
31585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
31595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
31615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
31625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
31635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
31645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
31655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
31665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
31675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
31685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
31695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
31705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
31725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
31735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
31745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
31755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
31765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
31775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
31785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
31805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
31825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
31835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          in24, in25, in26, in27, in28, in29, in30, in31;
31845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i col[128];
31855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
31865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
31875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
31885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
31895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_30, stp1_31;
31905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
31915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
31925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
31935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
31945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_30, stp2_31;
31955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
31965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int i, j, i32;
31975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
31995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (i = 0; i < 8; i++) {
32005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    i32 = (i << 5);
32015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    if (i == 0) {
32025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // First 1-D idct: first 8 rows
32035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Load input data.
32045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in0, input);
32055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in8, input);
32065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in16, input);
32075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in24, input);
32085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in1, input);
32095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in9, input);
32105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in17, input);
32115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in25, input);
32125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in2, input);
32135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in10, input);
32145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in18, input);
32155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in26, input);
32165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in3, input);
32175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in11, input);
32185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in19, input);
32195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in27, input);
32205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
32215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in4, input);
32225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in12, input);
32235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in20, input);
32245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in28, input);
32255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in5, input);
32265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in13, input);
32275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in21, input);
32285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in29, input);
32295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in6, input);
32305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in14, input);
32315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in22, input);
32325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in30, input);
32335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in7, input);
32345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in15, input);
32355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in23, input);
32365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in31, input);
32375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
32385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Transpose 32x8 block to 8x32 block
32395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
32405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in4, in5, in6, in7);
32415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
32425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in10, in11, in12, in13, in14, in15);
32435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
32445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in18, in19, in20, in21, in22, in23);
32455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
32465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in26, in27, in28, in29, in30, in31);
32475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    } else if (i < 4) {
32485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // First 1-D idct: next 24 zero-coeff rows
32495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 0] = _mm_setzero_si128();
32505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 1] = _mm_setzero_si128();
32515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 2] = _mm_setzero_si128();
32525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 3] = _mm_setzero_si128();
32535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 4] = _mm_setzero_si128();
32545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 5] = _mm_setzero_si128();
32555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 6] = _mm_setzero_si128();
32565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 7] = _mm_setzero_si128();
32575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 8] = _mm_setzero_si128();
32585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 9] = _mm_setzero_si128();
32595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 10] = _mm_setzero_si128();
32605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 11] = _mm_setzero_si128();
32615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 12] = _mm_setzero_si128();
32625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 13] = _mm_setzero_si128();
32635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 14] = _mm_setzero_si128();
32645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 15] = _mm_setzero_si128();
32655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 16] = _mm_setzero_si128();
32665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 17] = _mm_setzero_si128();
32675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 18] = _mm_setzero_si128();
32685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 19] = _mm_setzero_si128();
32695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 20] = _mm_setzero_si128();
32705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 21] = _mm_setzero_si128();
32715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 22] = _mm_setzero_si128();
32725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 23] = _mm_setzero_si128();
32735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 24] = _mm_setzero_si128();
32745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 25] = _mm_setzero_si128();
32755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 26] = _mm_setzero_si128();
32765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 27] = _mm_setzero_si128();
32775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 28] = _mm_setzero_si128();
32785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 29] = _mm_setzero_si128();
32795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 30] = _mm_setzero_si128();
32805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 31] = _mm_setzero_si128();
32815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      continue;
32825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    } else {
32835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Second 1-D idct
32845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      j = i - 4;
32855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
32865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Transpose 32x8 block to 8x32 block
32875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
32885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
32895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
32905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in5, in6, in7);
32915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      j += 4;
32925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
32935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
32945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
32955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in11, in12, in13, in14, in15);
32965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      j += 4;
32975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
32985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
32995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
33005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in19, in20, in21, in22, in23);
33015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      j += 4;
33025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
33035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
33045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
33055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in28, in29, in30, in31);
33065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
33075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
33085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    IDCT32_1D
33095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
33105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    // final stage
33115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    if (i < 4) {
33125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // 1_D: Store 32 intermediate results for each 8x32 block.
33135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
33145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
33155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
33165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
33175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
33185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
33195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
33205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
33215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
33225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
33235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
33245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
33255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
33265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
33275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
33285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
33295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
33305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
33315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
33325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
33335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
33345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
33355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
33365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
33375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
33385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
33395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
33405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
33415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
33425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
33435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
33445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
33455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    } else {
33465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      const __m128i zero = _mm_setzero_si128();
33475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
33485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // 2_D: Calculate the results and store them to destination.
33495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in0 = _mm_add_epi16(stp1_0, stp1_31);
33505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in1 = _mm_add_epi16(stp1_1, stp1_30);
33515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in2 = _mm_add_epi16(stp1_2, stp1_29);
33525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in3 = _mm_add_epi16(stp1_3, stp1_28);
33535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in4 = _mm_add_epi16(stp1_4, stp1_27);
33545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in5 = _mm_add_epi16(stp1_5, stp1_26);
33555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in6 = _mm_add_epi16(stp1_6, stp1_25);
33565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in7 = _mm_add_epi16(stp1_7, stp1_24);
33575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in8 = _mm_add_epi16(stp1_8, stp1_23);
33585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in9 = _mm_add_epi16(stp1_9, stp1_22);
33595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in10 = _mm_add_epi16(stp1_10, stp1_21);
33605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in11 = _mm_add_epi16(stp1_11, stp1_20);
33615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in12 = _mm_add_epi16(stp1_12, stp1_19);
33625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in13 = _mm_add_epi16(stp1_13, stp1_18);
33635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in14 = _mm_add_epi16(stp1_14, stp1_17);
33645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in15 = _mm_add_epi16(stp1_15, stp1_16);
33655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in16 = _mm_sub_epi16(stp1_15, stp1_16);
33665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in17 = _mm_sub_epi16(stp1_14, stp1_17);
33675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in18 = _mm_sub_epi16(stp1_13, stp1_18);
33685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in19 = _mm_sub_epi16(stp1_12, stp1_19);
33695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in20 = _mm_sub_epi16(stp1_11, stp1_20);
33705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in21 = _mm_sub_epi16(stp1_10, stp1_21);
33715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in22 = _mm_sub_epi16(stp1_9, stp1_22);
33725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in23 = _mm_sub_epi16(stp1_8, stp1_23);
33735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in24 = _mm_sub_epi16(stp1_7, stp1_24);
33745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in25 = _mm_sub_epi16(stp1_6, stp1_25);
33755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in26 = _mm_sub_epi16(stp1_5, stp1_26);
33765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in27 = _mm_sub_epi16(stp1_4, stp1_27);
33775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in28 = _mm_sub_epi16(stp1_3, stp1_28);
33785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in29 = _mm_sub_epi16(stp1_2, stp1_29);
33795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in30 = _mm_sub_epi16(stp1_1, stp1_30);
33805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in31 = _mm_sub_epi16(stp1_0, stp1_31);
33815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
33825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Final rounding and shift
33835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in0 = _mm_adds_epi16(in0, final_rounding);
33845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in1 = _mm_adds_epi16(in1, final_rounding);
33855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in2 = _mm_adds_epi16(in2, final_rounding);
33865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in3 = _mm_adds_epi16(in3, final_rounding);
33875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in4 = _mm_adds_epi16(in4, final_rounding);
33885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in5 = _mm_adds_epi16(in5, final_rounding);
33895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in6 = _mm_adds_epi16(in6, final_rounding);
33905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in7 = _mm_adds_epi16(in7, final_rounding);
33915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in8 = _mm_adds_epi16(in8, final_rounding);
33925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in9 = _mm_adds_epi16(in9, final_rounding);
33935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in10 = _mm_adds_epi16(in10, final_rounding);
33945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in11 = _mm_adds_epi16(in11, final_rounding);
33955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in12 = _mm_adds_epi16(in12, final_rounding);
33965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in13 = _mm_adds_epi16(in13, final_rounding);
33975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in14 = _mm_adds_epi16(in14, final_rounding);
33985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in15 = _mm_adds_epi16(in15, final_rounding);
33995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in16 = _mm_adds_epi16(in16, final_rounding);
34005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in17 = _mm_adds_epi16(in17, final_rounding);
34015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in18 = _mm_adds_epi16(in18, final_rounding);
34025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in19 = _mm_adds_epi16(in19, final_rounding);
34035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in20 = _mm_adds_epi16(in20, final_rounding);
34045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in21 = _mm_adds_epi16(in21, final_rounding);
34055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in22 = _mm_adds_epi16(in22, final_rounding);
34065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in23 = _mm_adds_epi16(in23, final_rounding);
34075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in24 = _mm_adds_epi16(in24, final_rounding);
34085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in25 = _mm_adds_epi16(in25, final_rounding);
34095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in26 = _mm_adds_epi16(in26, final_rounding);
34105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in27 = _mm_adds_epi16(in27, final_rounding);
34115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in28 = _mm_adds_epi16(in28, final_rounding);
34125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in29 = _mm_adds_epi16(in29, final_rounding);
34135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in30 = _mm_adds_epi16(in30, final_rounding);
34145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in31 = _mm_adds_epi16(in31, final_rounding);
34155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in0 = _mm_srai_epi16(in0, 6);
34175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in1 = _mm_srai_epi16(in1, 6);
34185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in2 = _mm_srai_epi16(in2, 6);
34195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in3 = _mm_srai_epi16(in3, 6);
34205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in4 = _mm_srai_epi16(in4, 6);
34215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in5 = _mm_srai_epi16(in5, 6);
34225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in6 = _mm_srai_epi16(in6, 6);
34235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in7 = _mm_srai_epi16(in7, 6);
34245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in8 = _mm_srai_epi16(in8, 6);
34255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in9 = _mm_srai_epi16(in9, 6);
34265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in10 = _mm_srai_epi16(in10, 6);
34275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in11 = _mm_srai_epi16(in11, 6);
34285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in12 = _mm_srai_epi16(in12, 6);
34295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in13 = _mm_srai_epi16(in13, 6);
34305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in14 = _mm_srai_epi16(in14, 6);
34315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in15 = _mm_srai_epi16(in15, 6);
34325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in16 = _mm_srai_epi16(in16, 6);
34335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in17 = _mm_srai_epi16(in17, 6);
34345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in18 = _mm_srai_epi16(in18, 6);
34355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in19 = _mm_srai_epi16(in19, 6);
34365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in20 = _mm_srai_epi16(in20, 6);
34375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in21 = _mm_srai_epi16(in21, 6);
34385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in22 = _mm_srai_epi16(in22, 6);
34395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in23 = _mm_srai_epi16(in23, 6);
34405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in24 = _mm_srai_epi16(in24, 6);
34415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in25 = _mm_srai_epi16(in25, 6);
34425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in26 = _mm_srai_epi16(in26, 6);
34435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in27 = _mm_srai_epi16(in27, 6);
34445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in28 = _mm_srai_epi16(in28, 6);
34455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in29 = _mm_srai_epi16(in29, 6);
34465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in30 = _mm_srai_epi16(in30, 6);
34475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in31 = _mm_srai_epi16(in31, 6);
34485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in0);
34505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in1);
34515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in2);
34525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in3);
34535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in4);
34545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in5);
34555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in6);
34565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in7);
34575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in8);
34585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in9);
34595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in10);
34605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in11);
34615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in12);
34625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in13);
34635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in14);
34645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in15);
34655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in16);
34665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in17);
34675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in18);
34685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in19);
34695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in20);
34705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in21);
34715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in22);
34725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in23);
34735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in24);
34745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in25);
34755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in26);
34765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in27);
34775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in28);
34785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in29);
34795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in30);
34805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in31);
34815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dest += 8 - (stride * 32);
34835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
34845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
34855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
34865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
34885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                 int stride) {
3489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
3491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // idct constants for each stage
3493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
3541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
3542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in24, in25, in26, in27, in28, in29, in30, in31;
3543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i col[128];
3544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_30, stp1_31;
3549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_30, stp2_31;
3554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3555f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int i, j, i32;
3556f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  __m128i zero_idx[16];
3557f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int zero_flag[2];
3558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // We work on an 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
3560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 8; i++) {
3561f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    i32 = (i << 5);
3562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i < 4) {
3563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // First 1-D idct
3564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Load input data.
3565f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in0, input);
3566f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in8, input);
3567f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in16, input);
3568f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in24, input);
3569f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in1, input);
3570f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in9, input);
3571f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in17, input);
3572f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in25, input);
3573f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in2, input);
3574f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in10, input);
3575f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in18, input);
3576f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in26, input);
3577f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in3, input);
3578f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in11, input);
3579f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in19, input);
3580f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in27, input);
3581f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3582f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in4, input);
3583f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in12, input);
3584f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in20, input);
3585f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in28, input);
3586f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in5, input);
3587f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in13, input);
3588f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in21, input);
3589f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in29, input);
3590f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in6, input);
3591f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in14, input);
3592f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in22, input);
3593f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in30, input);
3594f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in7, input);
3595f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in15, input);
3596f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in23, input);
3597f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in31, input);
3598f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3599f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      // checking if all entries are zero
3600f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[0] = _mm_or_si128(in0, in1);
3601f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[1] = _mm_or_si128(in2, in3);
3602f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[2] = _mm_or_si128(in4, in5);
3603f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[3] = _mm_or_si128(in6, in7);
3604f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[4] = _mm_or_si128(in8, in9);
3605f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[5] = _mm_or_si128(in10, in11);
3606f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[6] = _mm_or_si128(in12, in13);
3607f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[7] = _mm_or_si128(in14, in15);
3608f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[8] = _mm_or_si128(in16, in17);
3609f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[9] = _mm_or_si128(in18, in19);
3610f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[10] = _mm_or_si128(in20, in21);
3611f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[11] = _mm_or_si128(in22, in23);
3612f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[12] = _mm_or_si128(in24, in25);
3613f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[13] = _mm_or_si128(in26, in27);
3614f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[14] = _mm_or_si128(in28, in29);
3615f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[15] = _mm_or_si128(in30, in31);
3616f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3617f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3618f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3619f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3620f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3621f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3622f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3623f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3624f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3625f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3626f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3627f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3628f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3629f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3630f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3631f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3632f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3633f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3634f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
3635f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
3636f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
3637f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
3638f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
3639f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3640f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      if (!zero_flag[0] && !zero_flag[1]) {
3641f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 0] = _mm_setzero_si128();
3642f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 1] = _mm_setzero_si128();
3643f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 2] = _mm_setzero_si128();
3644f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 3] = _mm_setzero_si128();
3645f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 4] = _mm_setzero_si128();
3646f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 5] = _mm_setzero_si128();
3647f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 6] = _mm_setzero_si128();
3648f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 7] = _mm_setzero_si128();
3649f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 8] = _mm_setzero_si128();
3650f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 9] = _mm_setzero_si128();
3651f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 10] = _mm_setzero_si128();
3652f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 11] = _mm_setzero_si128();
3653f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 12] = _mm_setzero_si128();
3654f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 13] = _mm_setzero_si128();
3655f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 14] = _mm_setzero_si128();
3656f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 15] = _mm_setzero_si128();
3657f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 16] = _mm_setzero_si128();
3658f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 17] = _mm_setzero_si128();
3659f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 18] = _mm_setzero_si128();
3660f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 19] = _mm_setzero_si128();
3661f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 20] = _mm_setzero_si128();
3662f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 21] = _mm_setzero_si128();
3663f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 22] = _mm_setzero_si128();
3664f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 23] = _mm_setzero_si128();
3665f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 24] = _mm_setzero_si128();
3666f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 25] = _mm_setzero_si128();
3667f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 26] = _mm_setzero_si128();
3668f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 27] = _mm_setzero_si128();
3669f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 28] = _mm_setzero_si128();
3670f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 29] = _mm_setzero_si128();
3671f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 30] = _mm_setzero_si128();
3672f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 31] = _mm_setzero_si128();
3673f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        continue;
3674f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      }
3675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Transpose 32x8 block to 8x32 block
3677ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
3678ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
3679ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
3680ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in10, in11, in12, in13, in14, in15);
3681ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
3682ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in18, in19, in20, in21, in22, in23);
3683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
3684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in26, in27, in28, in29, in30, in31);
3685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else {
3686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Second 1-D idct
3687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j = i - 4;
3688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Transpose 32x8 block to 8x32 block
3690ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3691ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3692ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
3693ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in5, in6, in7);
3694ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j += 4;
3695ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3696ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3697ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
3698ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in11, in12, in13, in14, in15);
3699ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j += 4;
3700ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3701ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3702ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
3703ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in19, in20, in21, in22, in23);
3704ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j += 4;
3705ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3706ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3707ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
3708ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in28, in29, in30, in31);
3709ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
3710ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
37115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    IDCT32_1D
3712ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3713ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // final stage
3714ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i < 4) {
3715ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 1_D: Store 32 intermediate results for each 8x32 block.
3716f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3717f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3718f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3719f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3720f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3721f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3722f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3723f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3724f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3725f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3726f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3727f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3728f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3729f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3730f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3731f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3732f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3733f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3734f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3735f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3736f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3737f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3738f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3739f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3740f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3741f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3742f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3743f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3744f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3745f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3746f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3747f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3748ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else {
3749ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i zero = _mm_setzero_si128();
3750ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3751ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 2_D: Calculate the results and store them to destination.
3752ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_add_epi16(stp1_0, stp1_31);
3753ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_add_epi16(stp1_1, stp1_30);
3754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_add_epi16(stp1_2, stp1_29);
3755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_add_epi16(stp1_3, stp1_28);
3756ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_add_epi16(stp1_4, stp1_27);
3757ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_add_epi16(stp1_5, stp1_26);
3758ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_add_epi16(stp1_6, stp1_25);
3759ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_add_epi16(stp1_7, stp1_24);
3760ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_add_epi16(stp1_8, stp1_23);
3761ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_add_epi16(stp1_9, stp1_22);
3762ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_add_epi16(stp1_10, stp1_21);
3763ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_add_epi16(stp1_11, stp1_20);
3764ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_add_epi16(stp1_12, stp1_19);
3765ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_add_epi16(stp1_13, stp1_18);
3766ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_add_epi16(stp1_14, stp1_17);
3767ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_add_epi16(stp1_15, stp1_16);
3768ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in16 = _mm_sub_epi16(stp1_15, stp1_16);
3769ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in17 = _mm_sub_epi16(stp1_14, stp1_17);
3770ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in18 = _mm_sub_epi16(stp1_13, stp1_18);
3771ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in19 = _mm_sub_epi16(stp1_12, stp1_19);
3772ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in20 = _mm_sub_epi16(stp1_11, stp1_20);
3773ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in21 = _mm_sub_epi16(stp1_10, stp1_21);
3774ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in22 = _mm_sub_epi16(stp1_9, stp1_22);
3775ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in23 = _mm_sub_epi16(stp1_8, stp1_23);
3776ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in24 = _mm_sub_epi16(stp1_7, stp1_24);
3777ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in25 = _mm_sub_epi16(stp1_6, stp1_25);
3778ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in26 = _mm_sub_epi16(stp1_5, stp1_26);
3779ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in27 = _mm_sub_epi16(stp1_4, stp1_27);
3780ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in28 = _mm_sub_epi16(stp1_3, stp1_28);
3781ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in29 = _mm_sub_epi16(stp1_2, stp1_29);
3782ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in30 = _mm_sub_epi16(stp1_1, stp1_30);
3783ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in31 = _mm_sub_epi16(stp1_0, stp1_31);
3784ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3785ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Final rounding and shift
3786ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_adds_epi16(in0, final_rounding);
3787ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_adds_epi16(in1, final_rounding);
3788ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_adds_epi16(in2, final_rounding);
3789ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_adds_epi16(in3, final_rounding);
3790ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_adds_epi16(in4, final_rounding);
3791ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_adds_epi16(in5, final_rounding);
3792ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_adds_epi16(in6, final_rounding);
3793ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_adds_epi16(in7, final_rounding);
3794ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_adds_epi16(in8, final_rounding);
3795ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_adds_epi16(in9, final_rounding);
3796ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_adds_epi16(in10, final_rounding);
3797ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_adds_epi16(in11, final_rounding);
3798ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_adds_epi16(in12, final_rounding);
3799ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_adds_epi16(in13, final_rounding);
3800ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_adds_epi16(in14, final_rounding);
3801ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_adds_epi16(in15, final_rounding);
3802ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in16 = _mm_adds_epi16(in16, final_rounding);
3803ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in17 = _mm_adds_epi16(in17, final_rounding);
3804ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in18 = _mm_adds_epi16(in18, final_rounding);
3805ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in19 = _mm_adds_epi16(in19, final_rounding);
3806ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in20 = _mm_adds_epi16(in20, final_rounding);
3807ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in21 = _mm_adds_epi16(in21, final_rounding);
3808ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in22 = _mm_adds_epi16(in22, final_rounding);
3809ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in23 = _mm_adds_epi16(in23, final_rounding);
3810ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in24 = _mm_adds_epi16(in24, final_rounding);
3811ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in25 = _mm_adds_epi16(in25, final_rounding);
3812ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in26 = _mm_adds_epi16(in26, final_rounding);
3813ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in27 = _mm_adds_epi16(in27, final_rounding);
3814ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in28 = _mm_adds_epi16(in28, final_rounding);
3815ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in29 = _mm_adds_epi16(in29, final_rounding);
3816ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in30 = _mm_adds_epi16(in30, final_rounding);
3817ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in31 = _mm_adds_epi16(in31, final_rounding);
3818ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3819ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_srai_epi16(in0, 6);
3820ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_srai_epi16(in1, 6);
3821ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_srai_epi16(in2, 6);
3822ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_srai_epi16(in3, 6);
3823ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_srai_epi16(in4, 6);
3824ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_srai_epi16(in5, 6);
3825ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_srai_epi16(in6, 6);
3826ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_srai_epi16(in7, 6);
3827ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_srai_epi16(in8, 6);
3828ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_srai_epi16(in9, 6);
3829ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_srai_epi16(in10, 6);
3830ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_srai_epi16(in11, 6);
3831ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_srai_epi16(in12, 6);
3832ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_srai_epi16(in13, 6);
3833ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_srai_epi16(in14, 6);
3834ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_srai_epi16(in15, 6);
3835ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in16 = _mm_srai_epi16(in16, 6);
3836ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in17 = _mm_srai_epi16(in17, 6);
3837ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in18 = _mm_srai_epi16(in18, 6);
3838ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in19 = _mm_srai_epi16(in19, 6);
3839ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in20 = _mm_srai_epi16(in20, 6);
3840ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in21 = _mm_srai_epi16(in21, 6);
3841ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in22 = _mm_srai_epi16(in22, 6);
3842ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in23 = _mm_srai_epi16(in23, 6);
3843ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in24 = _mm_srai_epi16(in24, 6);
3844ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in25 = _mm_srai_epi16(in25, 6);
3845ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in26 = _mm_srai_epi16(in26, 6);
3846ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in27 = _mm_srai_epi16(in27, 6);
3847ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in28 = _mm_srai_epi16(in28, 6);
3848ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in29 = _mm_srai_epi16(in29, 6);
3849ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in30 = _mm_srai_epi16(in30, 6);
3850ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in31 = _mm_srai_epi16(in31, 6);
3851ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3852ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in0);
3853ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in1);
3854ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in2);
3855ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in3);
3856ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in4);
3857ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in5);
3858ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in6);
3859ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in7);
3860ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in8);
3861ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in9);
3862ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in10);
3863ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in11);
3864ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in12);
3865ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in13);
3866ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in14);
3867ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in15);
3868ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in16);
3869ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in17);
3870ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in18);
3871ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in19);
3872ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in20);
3873ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in21);
3874ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in22);
3875ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in23);
3876ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in24);
3877ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in25);
3878ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in26);
3879ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in27);
3880ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in28);
3881ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in29);
3882ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in30);
3883ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in31);
3884ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3885ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      dest += 8 - (stride * 32);
3886ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
3887ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
38885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}  //NOLINT
38895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
38905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
38915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i dc_value;
38925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i zero = _mm_setzero_si128();
38935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int a, i;
38945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
38955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
38965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  a = dct_const_round_shift(a * cospi_16_64);
38975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  a = ROUND_POWER_OF_TWO(a, 6);
38985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
38995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  dc_value = _mm_set1_epi16(a);
39005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
39015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (i = 0; i < 4; ++i) {
39025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dest += 8 - (stride * 32);
39355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
3936ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
3937