/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include <string.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

// Reconstructs one row of four pixels and steps to the next row.
//
// Reads 4 bytes at dest, widens them to 16 bits, adds the low four 16-bit
// lanes of in_x, packs back to bytes with unsigned saturation, writes the
// 4 bytes back to dest and finally advances dest by stride.
//
// dest must be a modifiable pointer lvalue. The expansion also uses two names
// from the enclosing scope: `zero` (an all-zero __m128i at the visible call
// sites) and `stride`.
//
// The pixels are moved with memcpy rather than through an int pointer cast:
// dest is a byte pointer with no alignment guarantee, so dereferencing it as
// int would be an aliasing/alignment violation.
#define RECON_AND_STORE4X4(dest, in_x)        \
  {                                           \
    int px4;                                  \
    __m128i d0;                               \
    memcpy(&px4, (dest), sizeof(px4));        \
    d0 = _mm_cvtsi32_si128(px4);              \
    d0 = _mm_unpacklo_epi8(d0, zero);         \
    d0 = _mm_add_epi16((in_x), d0);           \
    d0 = _mm_packus_epi16(d0, d0);            \
    px4 = _mm_cvtsi128_si32(d0);              \
    memcpy((dest), &px4, sizeof(px4));        \
    (dest) += stride;                         \
  }

285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
29ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
30ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i eight = _mm_set1_epi16(8);
31ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
32ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
33ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
34ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i input0, input1, input2, input3;
37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Rows
399b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_load_si128((const __m128i *)input);
409b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_load_si128((const __m128i *)(input + 8));
41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
42ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Construct i3, i1, i3, i1, i2, i0, i2, i0
43ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_shufflelo_epi16(input0, 0xd8);
449b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_shufflehi_epi16(input0, 0xd8);
45ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_shufflelo_epi16(input2, 0xd8);
469b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_shufflehi_epi16(input2, 0xd8);
47ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
489b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input0, input0);
49ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_unpacklo_epi32(input0, input0);
509b35249446b07f40ac5fcc3205f2c048616efacchkuang  input3 = _mm_unpackhi_epi32(input2, input2);
51ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_unpacklo_epi32(input2, input2);
52ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
53ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
54ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_madd_epi16(input0, cst);
55ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_madd_epi16(input1, cst);
56ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_madd_epi16(input2, cst);
57ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_madd_epi16(input3, cst);
58ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
59ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_add_epi32(input0, rounding);
60ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_add_epi32(input1, rounding);
61ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi32(input2, rounding);
62ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi32(input3, rounding);
63ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
64ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
65ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
66ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
67ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
68ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
709b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_packs_epi32(input0, input1);
719b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_packs_epi32(input2, input3);
72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
73ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Transpose
749b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_unpacklo_epi16(input0, input1);
759b35249446b07f40ac5fcc3205f2c048616efacchkuang  input3 = _mm_unpackhi_epi16(input0, input1);
769b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_unpacklo_epi32(input2, input3);
779b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input2, input3);
78ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
79ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Switch column2, column 3, and then, we got:
80ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // input2: column1, column 0;  input3: column2, column 3.
81ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shuffle_epi32(input1, 0x4e);
82ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input0, input1);
83ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_sub_epi16(input0, input1);
84ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
85ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Columns
86ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Construct i3, i1, i3, i1, i2, i0, i2, i0
879b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_unpacklo_epi32(input2, input2);
889b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input2, input2);
899b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_unpackhi_epi32(input3, input3);
90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_unpacklo_epi32(input3, input3);
91ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
92ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
93ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_madd_epi16(input0, cst);
94ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_madd_epi16(input1, cst);
95ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_madd_epi16(input2, cst);
96ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_madd_epi16(input3, cst);
97ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
98ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_add_epi32(input0, rounding);
99ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_add_epi32(input1, rounding);
100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi32(input2, rounding);
101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi32(input3, rounding);
102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
1099b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_packs_epi32(input0, input2);
1109b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_packs_epi32(input1, input3);
111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Transpose
1139b35249446b07f40ac5fcc3205f2c048616efacchkuang  input2 = _mm_unpacklo_epi16(input0, input1);
1149b35249446b07f40ac5fcc3205f2c048616efacchkuang  input3 = _mm_unpackhi_epi16(input0, input1);
1159b35249446b07f40ac5fcc3205f2c048616efacchkuang  input0 = _mm_unpacklo_epi32(input2, input3);
1169b35249446b07f40ac5fcc3205f2c048616efacchkuang  input1 = _mm_unpackhi_epi32(input2, input3);
117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Switch column2, column 3, and then, we got:
119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // input2: column1, column 0;  input3: column2, column 3.
120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shuffle_epi32(input1, 0x4e);
121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input0, input1);
122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_sub_epi16(input0, input1);
123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final round and shift
125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input2, eight);
126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi16(input3, eight);
127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi16(input2, 4);
129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi16(input3, 4);
130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1319b35249446b07f40ac5fcc3205f2c048616efacchkuang  // Reconstruction and Store
1329b35249446b07f40ac5fcc3205f2c048616efacchkuang  {
1339b35249446b07f40ac5fcc3205f2c048616efacchkuang     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
1349b35249446b07f40ac5fcc3205f2c048616efacchkuang     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
1359b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_unpacklo_epi32(d0,
1369b35249446b07f40ac5fcc3205f2c048616efacchkuang          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
1379b35249446b07f40ac5fcc3205f2c048616efacchkuang     d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
1389b35249446b07f40ac5fcc3205f2c048616efacchkuang                    *(const int *) (dest + stride * 3)), d2);
1399b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_unpacklo_epi8(d0, zero);
1409b35249446b07f40ac5fcc3205f2c048616efacchkuang     d2 = _mm_unpacklo_epi8(d2, zero);
1419b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_add_epi16(d0, input2);
1429b35249446b07f40ac5fcc3205f2c048616efacchkuang     d2 = _mm_add_epi16(d2, input3);
1439b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_packus_epi16(d0, d2);
1449b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input0
1459b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)dest = _mm_cvtsi128_si32(d0);
1469b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input1
1479b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_srli_si128(d0, 4);
1489b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
1499b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input2
1509b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_srli_si128(d0, 4);
1519b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
1529b35249446b07f40ac5fcc3205f2c048616efacchkuang     // store input3
1539b35249446b07f40ac5fcc3205f2c048616efacchkuang     d0 = _mm_srli_si128(d0, 4);
1549b35249446b07f40ac5fcc3205f2c048616efacchkuang     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
15991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i dc_value;
16091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i zero = _mm_setzero_si128();
16191037db265ecdd914a26e056cf69207b4f50924ehkuang  int a;
16291037db265ecdd914a26e056cf69207b4f50924ehkuang
16391037db265ecdd914a26e056cf69207b4f50924ehkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
16491037db265ecdd914a26e056cf69207b4f50924ehkuang  a = dct_const_round_shift(a * cospi_16_64);
16591037db265ecdd914a26e056cf69207b4f50924ehkuang  a = ROUND_POWER_OF_TWO(a, 4);
16691037db265ecdd914a26e056cf69207b4f50924ehkuang
16791037db265ecdd914a26e056cf69207b4f50924ehkuang  dc_value = _mm_set1_epi16(a);
16891037db265ecdd914a26e056cf69207b4f50924ehkuang
16991037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17091037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17191037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
17391037db265ecdd914a26e056cf69207b4f50924ehkuang}
17491037db265ecdd914a26e056cf69207b4f50924ehkuang
17591037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void transpose_4x4(__m128i *res) {
17691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
177b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
17891037db265ecdd914a26e056cf69207b4f50924ehkuang
179b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
180b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
18191037db265ecdd914a26e056cf69207b4f50924ehkuang}
18291037db265ecdd914a26e056cf69207b4f50924ehkuang
183b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct4_sse2(__m128i *in) {
18491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
18591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
18691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
18791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
18891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
18991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u[8], v[8];
19091037db265ecdd914a26e056cf69207b4f50924ehkuang
19191037db265ecdd914a26e056cf69207b4f50924ehkuang  transpose_4x4(in);
19291037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
193b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
194b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
19591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
19691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
19791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
19891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
19991037db265ecdd914a26e056cf69207b4f50924ehkuang
20091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
20191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
20291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
20391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
20491037db265ecdd914a26e056cf69207b4f50924ehkuang
20591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
20691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
20791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
20891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
20991037db265ecdd914a26e056cf69207b4f50924ehkuang
210b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0] = _mm_packs_epi32(v[0], v[1]);
211b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1] = _mm_packs_epi32(v[3], v[2]);
21291037db265ecdd914a26e056cf69207b4f50924ehkuang
21391037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
214b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[0] = _mm_add_epi16(u[0], u[1]);
215b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[1] = _mm_sub_epi16(u[0], u[1]);
216b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
21791037db265ecdd914a26e056cf69207b4f50924ehkuang}
21891037db265ecdd914a26e056cf69207b4f50924ehkuang
219b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst4_sse2(__m128i *in) {
22091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
22191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
22291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
22391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
22491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
22591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i kZero = _mm_set1_epi16(0);
22691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
22791037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u[8], v[8], in7;
22891037db265ecdd914a26e056cf69207b4f50924ehkuang
22991037db265ecdd914a26e056cf69207b4f50924ehkuang  transpose_4x4(in);
230b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in7 = _mm_srli_si128(in[1], 8);
231b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in7 = _mm_add_epi16(in7, in[0]);
232b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in7 = _mm_sub_epi16(in7, in[1]);
23391037db265ecdd914a26e056cf69207b4f50924ehkuang
234b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
235b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
23691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(in7, kZero);
237b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[3] = _mm_unpackhi_epi16(in[0], kZero);
23891037db265ecdd914a26e056cf69207b4f50924ehkuang
23991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
24091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
24191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
24291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
24391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
24491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
24591037db265ecdd914a26e056cf69207b4f50924ehkuang
24691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[1]);
24791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[3], v[4]);
24891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = v[2];
24991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(u[0], u[1]);
25091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_slli_epi32(v[5], 2);
25191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(u[3], v[5]);
25291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_sub_epi32(u[5], u[4]);
25391037db265ecdd914a26e056cf69207b4f50924ehkuang
25491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
25591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
25691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
25791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
25891037db265ecdd914a26e056cf69207b4f50924ehkuang
25991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
26091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
26191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
26291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
26391037db265ecdd914a26e056cf69207b4f50924ehkuang
264b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[0] = _mm_packs_epi32(u[0], u[1]);
265b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[1] = _mm_packs_epi32(u[2], u[3]);
26691037db265ecdd914a26e056cf69207b4f50924ehkuang}
26791037db265ecdd914a26e056cf69207b4f50924ehkuang
2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            int tx_type) {
270b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in[2];
27191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i zero = _mm_setzero_si128();
27291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i eight = _mm_set1_epi16(8);
27391037db265ecdd914a26e056cf69207b4f50924ehkuang
274b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[0]= _mm_loadu_si128((const __m128i *)(input));
275b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
27691037db265ecdd914a26e056cf69207b4f50924ehkuang
27791037db265ecdd914a26e056cf69207b4f50924ehkuang  switch (tx_type) {
27891037db265ecdd914a26e056cf69207b4f50924ehkuang    case 0:  // DCT_DCT
279b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      idct4_sse2(in);
280b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      idct4_sse2(in);
28191037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
28291037db265ecdd914a26e056cf69207b4f50924ehkuang    case 1:  // ADST_DCT
283b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      idct4_sse2(in);
284b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      iadst4_sse2(in);
28591037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
28691037db265ecdd914a26e056cf69207b4f50924ehkuang    case 2:  // DCT_ADST
287b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      iadst4_sse2(in);
288b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      idct4_sse2(in);
28991037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
29091037db265ecdd914a26e056cf69207b4f50924ehkuang    case 3:  // ADST_ADST
291b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      iadst4_sse2(in);
292b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      iadst4_sse2(in);
29391037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
29491037db265ecdd914a26e056cf69207b4f50924ehkuang    default:
29591037db265ecdd914a26e056cf69207b4f50924ehkuang      assert(0);
29691037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
29791037db265ecdd914a26e056cf69207b4f50924ehkuang  }
29891037db265ecdd914a26e056cf69207b4f50924ehkuang
29991037db265ecdd914a26e056cf69207b4f50924ehkuang  // Final round and shift
30091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_add_epi16(in[0], eight);
30191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_add_epi16(in[1], eight);
30291037db265ecdd914a26e056cf69207b4f50924ehkuang
30391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_srai_epi16(in[0], 4);
30491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_srai_epi16(in[1], 4);
30591037db265ecdd914a26e056cf69207b4f50924ehkuang
306b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Reconstruction and Store
307b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  {
308b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
309b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
310b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_unpacklo_epi32(d0,
311b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
312b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
313b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                    *(const int *) (dest + stride * 3)));
314b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_unpacklo_epi8(d0, zero);
315b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d2 = _mm_unpacklo_epi8(d2, zero);
316b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_add_epi16(d0, in[0]);
317b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d2 = _mm_add_epi16(d2, in[1]);
318b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_packus_epi16(d0, d2);
319b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     // store result[0]
320b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     *(int *)dest = _mm_cvtsi128_si32(d0);
321b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     // store result[1]
322b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_srli_si128(d0, 4);
323b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
324b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     // store result[2]
325b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_srli_si128(d0, 4);
326b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
327b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     // store result[3]
328b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     d0 = _mm_srli_si128(d0, 4);
329b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
330b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  }
33191037db265ecdd914a26e056cf69207b4f50924ehkuang}
33291037db265ecdd914a26e056cf69207b4f50924ehkuang
// Transposes an 8x8 block of 16-bit values.
//
// in0..in7 are the eight rows (one __m128i each); out0..out7 receive the
// eight columns. Every input is read into a temporary before the first output
// is assigned, so the outputs may name the same variables as the inputs.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,         \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                                   \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);               \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);               \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1);               \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3);               \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5);               \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7);               \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5);               \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7);               \
                                                                      \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);           \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);           \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);           \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);           \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);           \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);           \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);           \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);           \
                                                                      \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                          \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                          \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                          \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                          \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5);                          \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5);                          \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7);                          \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7);                          \
  }

// Shuffles four rows of eight 16-bit lanes into four output rows using three
// rounds of interleaves (16-, 32-, then 64-bit).  The first round is
// deliberately asymmetric: it takes the HIGH half of (tmp0, tmp1) but the LOW
// half of (tmp1, tmp0), and the LOW half of (tmp2, tmp3) but the HIGH half of
// (tmp3, tmp2).  Calling the input rows a, b, c, d, output k (k = 0..3) is
//   { a[4+k], b[4+k], b[k], a[k], c[k], d[k], d[4+k], c[4+k] }.
// Every input is read before the first output is written.
#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
                         out0, out1, out2, out3) \
  { \
    /* Round 1: 16-bit interleave. */ \
    const __m128i t4x8_hi_ab = _mm_unpackhi_epi16(tmp0, tmp1); \
    const __m128i t4x8_lo_ba = _mm_unpacklo_epi16(tmp1, tmp0); \
    const __m128i t4x8_lo_cd = _mm_unpacklo_epi16(tmp2, tmp3); \
    const __m128i t4x8_hi_dc = _mm_unpackhi_epi16(tmp3, tmp2); \
    \
    /* Round 2: 32-bit interleave of the pairs built above. */ \
    const __m128i t4x8_ab_01 = _mm_unpacklo_epi32(t4x8_hi_ab, t4x8_lo_ba); \
    const __m128i t4x8_ab_23 = _mm_unpackhi_epi32(t4x8_hi_ab, t4x8_lo_ba); \
    const __m128i t4x8_cd_01 = _mm_unpacklo_epi32(t4x8_lo_cd, t4x8_hi_dc); \
    const __m128i t4x8_cd_23 = _mm_unpackhi_epi32(t4x8_lo_cd, t4x8_hi_dc); \
    \
    /* Round 3: 64-bit interleave produces the final rows. */ \
    out0 = _mm_unpacklo_epi64(t4x8_ab_01, t4x8_cd_01); \
    out1 = _mm_unpackhi_epi64(t4x8_ab_01, t4x8_cd_01); \
    out2 = _mm_unpacklo_epi64(t4x8_ab_23, t4x8_cd_23); \
    out3 = _mm_unpackhi_epi64(t4x8_ab_23, t4x8_cd_23); \
  }
382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Transposes the low four 16-bit lanes of four input rows into two output
// rows.  Calling the input rows a, b, c, d:
//   out0 = { a0, b0, c0, d0, a1, b1, c1, d1 }   (columns 0 and 1)
//   out1 = { a2, b2, c2, d2, a3, b3, c3, d3 }   (columns 2 and 3)
// The upper four lanes of every input are ignored.
//
// The results are written to out0/out1.  Both interleaved temporaries are
// computed before either output is assigned, so passing in0/in1 as out0/out1
// (in-place use) is safe and yields the same values as before.
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* columns 0 (low), 1 (high) */ \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* columns 2 (low), 3 (high) */ \
  }
391b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
// Despite the name, this consumes only the low four 16-bit lanes of four
// rows and emits two rows.  Calling the input rows a, b, c, d:
//   out0 = { a0, b0, c0, d0, a1, b1, c1, d1 }
//   out1 = { a2, b2, c2, d2, a3, b3, c3, d3 }
// Both interleaved temporaries are formed before out0/out1 are assigned.
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  { \
    /* Pair up rows 0/1 and rows 2/3 lane by lane. */ \
    const __m128i t8x8_rows01 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i t8x8_rows23 = _mm_unpacklo_epi16(in2, in3); \
    /* Merge the pairs so each 64-bit half holds one column. */ \
    out0 = _mm_unpacklo_epi32(t8x8_rows01, t8x8_rows23); \
    out1 = _mm_unpackhi_epi32(t8x8_rows01, t8x8_rows23); \
  }
399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Multiply-accumulate helper producing four packed 16-bit results.
//   res0 <- (lo_0, hi_0) . cst0      res1 <- (lo_0, hi_0) . cst1
//   res2 <- (lo_1, hi_1) . cst2      res3 <- (lo_1, hi_1) . cst3
// Each product is formed with _mm_madd_epi16 (adjacent 16-bit lanes are
// multiplied and summed into 32 bits), biased by `rounding`, arithmetically
// shifted right by DCT_CONST_BITS, and packed back to 16 bits with signed
// saturation.
// The expanding scope must provide `rounding` and the scratch vectors
// tmp0..tmp7, all of which are overwritten.  Results are stored only after
// every scratch vector has been computed.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    /* First input pair against cst0 and cst1. */ \
    tmp0 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_0, cst0), rounding), DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_0, cst0), rounding), DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_0, cst1), rounding), DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_0, cst1), rounding), DCT_CONST_BITS); \
    /* Second input pair against cst2 and cst3. */ \
    tmp4 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_1, cst2), rounding), DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_1, cst2), rounding), DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_1, cst3), rounding), DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_1, cst3), rounding), DCT_CONST_BITS); \
    \
    /* Narrow each low/high pair back to eight 16-bit lanes. */ \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }
436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Two-result variant of MULTIPLICATION_AND_ADD for a single input pair:
//   res0 <- (lo_0, hi_0) . cst0      res1 <- (lo_0, hi_0) . cst1
// Each product is a _mm_madd_epi16, biased by `rounding`, arithmetically
// shifted right by DCT_CONST_BITS and packed to 16 bits with signed
// saturation.  The expanding scope must provide `rounding` and the scratch
// vectors tmp0..tmp3, which are overwritten.  Results are stored last.
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  { \
    /* Products against cst0. */ \
    tmp0 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_0, cst0), rounding), DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_0, cst0), rounding), DCT_CONST_BITS); \
    /* Products against cst1. */ \
    tmp2 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_0, cst1), rounding), DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_0, cst1), rounding), DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
  }
457b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
// Four-stage 1-D 8-point inverse DCT applied to eight vectors of 16-bit
// lanes (in0..in7), with the results written to out0..out7.
// The expanding scope must provide:
//   - the constant vectors stg1_0..stg1_3 and stg2_0..stg2_3,
//   - `rounding` (consumed by the MULTIPLICATION_AND_ADD* helpers),
//   - scratch vectors stp1_0..stp1_7, stp2_0..stp2_7 and tmp0..tmp7,
//     all of which are overwritten.
// out0..out7 are assigned only in the last stage, from stp* values, so the
// outputs may name the same variables as the inputs.
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    /* Stage 1: odd-indexed inputs, paired as (1,7) and (3,5). */ \
    { \
      const __m128i idct8_lo17 = _mm_unpacklo_epi16(in1, in7); \
      const __m128i idct8_hi17 = _mm_unpackhi_epi16(in1, in7); \
      const __m128i idct8_lo35 = _mm_unpacklo_epi16(in3, in5); \
      const __m128i idct8_hi35 = _mm_unpackhi_epi16(in3, in5); \
      \
      MULTIPLICATION_AND_ADD(idct8_lo17, idct8_hi17, idct8_lo35, idct8_hi35, \
                             stg1_0, stg1_1, stg1_2, stg1_3, \
                             stp1_4, stp1_7, stp1_5, stp1_6) \
    } \
    \
    /* Stage 2: even-indexed inputs, paired as (0,4) and (2,6), plus the */ \
    /* saturating butterflies on the stage-1 results. */ \
    { \
      const __m128i idct8_lo04 = _mm_unpacklo_epi16(in0, in4); \
      const __m128i idct8_hi04 = _mm_unpackhi_epi16(in0, in4); \
      const __m128i idct8_lo26 = _mm_unpacklo_epi16(in2, in6); \
      const __m128i idct8_hi26 = _mm_unpackhi_epi16(in2, in6); \
      \
      MULTIPLICATION_AND_ADD(idct8_lo04, idct8_hi04, idct8_lo26, idct8_hi26, \
                             stg2_0, stg2_1, stg2_2, stg2_3, \
                             stp2_0, stp2_1, stp2_2, stp2_3) \
      \
      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
    } \
    \
    /* Stage 3: even-half butterflies, then the (6,5) pair is multiplied */ \
    /* by stg2_1 (-> stp1_5) and stg2_0 (-> stp1_6). */ \
    { \
      const __m128i idct8_lo65 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i idct8_hi65 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
      \
      MULTIPLICATION_AND_ADD_2(idct8_lo65, idct8_hi65, stg2_1, stg2_0, \
                               stp1_5, stp1_6) \
    } \
    \
    /* Stage 4: final saturating butterflies into the outputs. */ \
    out0 = _mm_adds_epi16(stp1_0, stp2_7); \
    out1 = _mm_adds_epi16(stp1_1, stp1_6); \
    out2 = _mm_adds_epi16(stp1_2, stp1_5); \
    out3 = _mm_adds_epi16(stp1_3, stp2_4); \
    out4 = _mm_subs_epi16(stp1_3, stp2_4); \
    out5 = _mm_subs_epi16(stp1_2, stp1_5); \
    out6 = _mm_subs_epi16(stp1_1, stp1_6); \
    out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }
529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Adds eight 16-bit residuals (in_x) to the eight bytes at `dest`, clamps
// each sum to 0..255 (unsigned saturating pack), writes the eight bytes back
// and then advances `dest` by `stride`.
// The expanding scope must provide `zero` (used to widen bytes to 16 bits)
// and `stride`; `dest` must be a modifiable pointer.
#define RECON_AND_STORE(dest, in_x) \
  { \
    /* Load 8 pixels and widen them to 16-bit lanes. */ \
    const __m128i recon_px8 = _mm_loadl_epi64((const __m128i *)(dest)); \
    const __m128i recon_px16 = _mm_unpacklo_epi8(recon_px8, zero); \
    /* Add the residual, then narrow with unsigned saturation. */ \
    const __m128i recon_sum = _mm_add_epi16(in_x, recon_px16); \
    _mm_storel_epi64((__m128i *)(dest), \
                     _mm_packus_epi16(recon_sum, recon_sum)); \
    dest += stride; \
  }
539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
5405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<4);
544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Load input data.
5605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in0 = _mm_load_si128((const __m128i *)input);
5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 2-D
570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 2; i++) {
5715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
572b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
573b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                  in0, in1, in2, in3, in4, in5, in6, in7);
574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // 4-stage 1D idct8x8
576b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
577b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian             in0, in1, in2, in3, in4, in5, in6, in7);
578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final rounding and shift
581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(in0, final_rounding);
582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(in1, final_rounding);
583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(in2, final_rounding);
584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(in3, final_rounding);
585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_adds_epi16(in4, final_rounding);
586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_adds_epi16(in5, final_rounding);
587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_adds_epi16(in6, final_rounding);
588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_adds_epi16(in7, final_rounding);
589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_srai_epi16(in0, 5);
591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_srai_epi16(in1, 5);
592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_srai_epi16(in2, 5);
593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_srai_epi16(in3, 5);
594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_srai_epi16(in4, 5);
595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_srai_epi16(in5, 5);
596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_srai_epi16(in6, 5);
597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_srai_epi16(in7, 5);
598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in0);
600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in1);
601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in2);
602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in3);
603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in4);
604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in5);
605ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in6);
606ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in7);
607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
608ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
610f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  __m128i dc_value;
611f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  const __m128i zero = _mm_setzero_si128();
612f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int a;
613f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
614f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
615f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(a * cospi_16_64);
616f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = ROUND_POWER_OF_TWO(a, 5);
617f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
618f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  dc_value = _mm_set1_epi16(a);
619f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
620f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
621f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
622f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
623f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
624f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
625f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
626f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
627f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
628f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang}
629f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
63091037db265ecdd914a26e056cf69207b4f50924ehkuang// perform 8x8 transpose
63191037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
63291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
63391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
63491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
63591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
63691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
63791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
63891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
63991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
64091037db265ecdd914a26e056cf69207b4f50924ehkuang
64191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
64291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
64391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
64491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
64591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
64691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
64791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
64891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
64991037db265ecdd914a26e056cf69207b4f50924ehkuang
65091037db265ecdd914a26e056cf69207b4f50924ehkuang  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
65191037db265ecdd914a26e056cf69207b4f50924ehkuang  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
65291037db265ecdd914a26e056cf69207b4f50924ehkuang  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
65391037db265ecdd914a26e056cf69207b4f50924ehkuang  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
65491037db265ecdd914a26e056cf69207b4f50924ehkuang  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
65591037db265ecdd914a26e056cf69207b4f50924ehkuang  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
65691037db265ecdd914a26e056cf69207b4f50924ehkuang  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
65791037db265ecdd914a26e056cf69207b4f50924ehkuang  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
65891037db265ecdd914a26e056cf69207b4f50924ehkuang}
65991037db265ecdd914a26e056cf69207b4f50924ehkuang
660b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
661b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
662b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
663b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
664b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
665b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
666b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
667b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
668b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
669b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
670b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
671b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
672b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
673b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
674b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
675b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}
676b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
677b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct8_sse2(__m128i *in) {
67891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
67991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
68091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
68191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
68291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
68391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
68491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
68591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
68691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
68791037db265ecdd914a26e056cf69207b4f50924ehkuang
68891037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
68991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
69091037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
69191037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
69291037db265ecdd914a26e056cf69207b4f50924ehkuang
6935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
694b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
695b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                in0, in1, in2, in3, in4, in5, in6, in7);
69691037db265ecdd914a26e056cf69207b4f50924ehkuang
69791037db265ecdd914a26e056cf69207b4f50924ehkuang  // 4-stage 1D idct8x8
698b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
699b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian           in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
70091037db265ecdd914a26e056cf69207b4f50924ehkuang}
70191037db265ecdd914a26e056cf69207b4f50924ehkuang
702b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst8_sse2(__m128i *in) {
70391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
70491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
70591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
70691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
70791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
70891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
70991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
71091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
71191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
71291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
71391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
71491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
71591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
71691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__const_0 = _mm_set1_epi16(0);
71791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
71891037db265ecdd914a26e056cf69207b4f50924ehkuang
71991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
72091037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
72191037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
72291037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
72391037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
72491037db265ecdd914a26e056cf69207b4f50924ehkuang
72591037db265ecdd914a26e056cf69207b4f50924ehkuang  // transpose
72691037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(in, in);
72791037db265ecdd914a26e056cf69207b4f50924ehkuang
72891037db265ecdd914a26e056cf69207b4f50924ehkuang  // properly aligned for butterfly input
72991037db265ecdd914a26e056cf69207b4f50924ehkuang  in0  = in[7];
73091037db265ecdd914a26e056cf69207b4f50924ehkuang  in1  = in[0];
73191037db265ecdd914a26e056cf69207b4f50924ehkuang  in2  = in[5];
73291037db265ecdd914a26e056cf69207b4f50924ehkuang  in3  = in[2];
73391037db265ecdd914a26e056cf69207b4f50924ehkuang  in4  = in[3];
73491037db265ecdd914a26e056cf69207b4f50924ehkuang  in5  = in[4];
73591037db265ecdd914a26e056cf69207b4f50924ehkuang  in6  = in[1];
73691037db265ecdd914a26e056cf69207b4f50924ehkuang  in7  = in[6];
73791037db265ecdd914a26e056cf69207b4f50924ehkuang
73891037db265ecdd914a26e056cf69207b4f50924ehkuang  // column transformation
73991037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
74091037db265ecdd914a26e056cf69207b4f50924ehkuang  // interleave and multiply/add into 32-bit integer
74191037db265ecdd914a26e056cf69207b4f50924ehkuang  s0 = _mm_unpacklo_epi16(in0, in1);
74291037db265ecdd914a26e056cf69207b4f50924ehkuang  s1 = _mm_unpackhi_epi16(in0, in1);
74391037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_unpacklo_epi16(in2, in3);
74491037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_unpackhi_epi16(in2, in3);
74591037db265ecdd914a26e056cf69207b4f50924ehkuang  s4 = _mm_unpacklo_epi16(in4, in5);
74691037db265ecdd914a26e056cf69207b4f50924ehkuang  s5 = _mm_unpackhi_epi16(in4, in5);
74791037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_unpacklo_epi16(in6, in7);
74891037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_unpackhi_epi16(in6, in7);
74991037db265ecdd914a26e056cf69207b4f50924ehkuang
75091037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
75191037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
75291037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
75391037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
75491037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
75591037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
75691037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
75791037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
75891037db265ecdd914a26e056cf69207b4f50924ehkuang  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
75991037db265ecdd914a26e056cf69207b4f50924ehkuang  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
76091037db265ecdd914a26e056cf69207b4f50924ehkuang  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
76191037db265ecdd914a26e056cf69207b4f50924ehkuang  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
76291037db265ecdd914a26e056cf69207b4f50924ehkuang  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
76391037db265ecdd914a26e056cf69207b4f50924ehkuang  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
76491037db265ecdd914a26e056cf69207b4f50924ehkuang  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
76591037db265ecdd914a26e056cf69207b4f50924ehkuang  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
76691037db265ecdd914a26e056cf69207b4f50924ehkuang
76791037db265ecdd914a26e056cf69207b4f50924ehkuang  // addition
76891037db265ecdd914a26e056cf69207b4f50924ehkuang  w0 = _mm_add_epi32(u0, u8);
76991037db265ecdd914a26e056cf69207b4f50924ehkuang  w1 = _mm_add_epi32(u1, u9);
77091037db265ecdd914a26e056cf69207b4f50924ehkuang  w2 = _mm_add_epi32(u2, u10);
77191037db265ecdd914a26e056cf69207b4f50924ehkuang  w3 = _mm_add_epi32(u3, u11);
77291037db265ecdd914a26e056cf69207b4f50924ehkuang  w4 = _mm_add_epi32(u4, u12);
77391037db265ecdd914a26e056cf69207b4f50924ehkuang  w5 = _mm_add_epi32(u5, u13);
77491037db265ecdd914a26e056cf69207b4f50924ehkuang  w6 = _mm_add_epi32(u6, u14);
77591037db265ecdd914a26e056cf69207b4f50924ehkuang  w7 = _mm_add_epi32(u7, u15);
77691037db265ecdd914a26e056cf69207b4f50924ehkuang  w8 = _mm_sub_epi32(u0, u8);
77791037db265ecdd914a26e056cf69207b4f50924ehkuang  w9 = _mm_sub_epi32(u1, u9);
77891037db265ecdd914a26e056cf69207b4f50924ehkuang  w10 = _mm_sub_epi32(u2, u10);
77991037db265ecdd914a26e056cf69207b4f50924ehkuang  w11 = _mm_sub_epi32(u3, u11);
78091037db265ecdd914a26e056cf69207b4f50924ehkuang  w12 = _mm_sub_epi32(u4, u12);
78191037db265ecdd914a26e056cf69207b4f50924ehkuang  w13 = _mm_sub_epi32(u5, u13);
78291037db265ecdd914a26e056cf69207b4f50924ehkuang  w14 = _mm_sub_epi32(u6, u14);
78391037db265ecdd914a26e056cf69207b4f50924ehkuang  w15 = _mm_sub_epi32(u7, u15);
78491037db265ecdd914a26e056cf69207b4f50924ehkuang
78591037db265ecdd914a26e056cf69207b4f50924ehkuang  // shift and rounding
78691037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
78791037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
78891037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
78991037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
79091037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
79191037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
79291037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
79391037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
79491037db265ecdd914a26e056cf69207b4f50924ehkuang  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
79591037db265ecdd914a26e056cf69207b4f50924ehkuang  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
79691037db265ecdd914a26e056cf69207b4f50924ehkuang  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
79791037db265ecdd914a26e056cf69207b4f50924ehkuang  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
79891037db265ecdd914a26e056cf69207b4f50924ehkuang  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
79991037db265ecdd914a26e056cf69207b4f50924ehkuang  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
80091037db265ecdd914a26e056cf69207b4f50924ehkuang  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
80191037db265ecdd914a26e056cf69207b4f50924ehkuang  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
80291037db265ecdd914a26e056cf69207b4f50924ehkuang
80391037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
80491037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
80591037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
80691037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
80791037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
80891037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
80991037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
810