vp9_idct_intrin_sse2.c revision 5ae7ac49f08a179e4f054d99fcfc9dce78d26e58
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
#include <assert.h>
#include <emmintrin.h>  // SSE2
#include <string.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
19ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
20ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i eight = _mm_set1_epi16(8);
21ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
22ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
23ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
24ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
25ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
26ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i input0, input1, input2, input3;
27ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
28ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Rows
295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  input0 = _mm_loadl_epi64((const __m128i *)input);
305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  input1 = _mm_loadl_epi64((const __m128i *)(input + 4));
315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  input2 = _mm_loadl_epi64((const __m128i *)(input + 8));
325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  input3 = _mm_loadl_epi64((const __m128i *)(input + 12));
33ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
34ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Construct i3, i1, i3, i1, i2, i0, i2, i0
35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_shufflelo_epi16(input0, 0xd8);
36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shufflelo_epi16(input1, 0xd8);
37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_shufflelo_epi16(input2, 0xd8);
38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_shufflelo_epi16(input3, 0xd8);
39ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
40ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_unpacklo_epi32(input0, input0);
41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpacklo_epi32(input1, input1);
42ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_unpacklo_epi32(input2, input2);
43ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_unpacklo_epi32(input3, input3);
44ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
45ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
46ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_madd_epi16(input0, cst);
47ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_madd_epi16(input1, cst);
48ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_madd_epi16(input2, cst);
49ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_madd_epi16(input3, cst);
50ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
51ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_add_epi32(input0, rounding);
52ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_add_epi32(input1, rounding);
53ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi32(input2, rounding);
54ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi32(input3, rounding);
55ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
56ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
57ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
58ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
59ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
60ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
61ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
62ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_packs_epi32(input0, zero);
63ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_packs_epi32(input1, zero);
64ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_packs_epi32(input2, zero);
65ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_packs_epi32(input3, zero);
66ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
67ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Transpose
68ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpacklo_epi16(input0, input1);
69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_unpacklo_epi16(input2, input3);
70ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_unpacklo_epi32(input1, input3);
71ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpackhi_epi32(input1, input3);
72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
73ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Switch column2, column 3, and then, we got:
74ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // input2: column1, column 0;  input3: column2, column 3.
75ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shuffle_epi32(input1, 0x4e);
76ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input0, input1);
77ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_sub_epi16(input0, input1);
78ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
79ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Columns
80ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Construct i3, i1, i3, i1, i2, i0, i2, i0
81ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_shufflelo_epi16(input2, 0xd8);
82ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shufflehi_epi16(input2, 0xd8);
83ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_shufflehi_epi16(input3, 0xd8);
84ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_shufflelo_epi16(input3, 0xd8);
85ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
86ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_unpacklo_epi32(input0, input0);
87ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpackhi_epi32(input1, input1);
88ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_unpackhi_epi32(input2, input2);
89ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_unpacklo_epi32(input3, input3);
90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
91ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
92ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_madd_epi16(input0, cst);
93ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_madd_epi16(input1, cst);
94ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_madd_epi16(input2, cst);
95ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_madd_epi16(input3, cst);
96ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
97ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_add_epi32(input0, rounding);
98ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_add_epi32(input1, rounding);
99ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi32(input2, rounding);
100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi32(input3, rounding);
101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_packs_epi32(input0, zero);
109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_packs_epi32(input1, zero);
110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_packs_epi32(input2, zero);
111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_packs_epi32(input3, zero);
112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Transpose
114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpacklo_epi16(input0, input1);
115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_unpacklo_epi16(input2, input3);
116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_unpacklo_epi32(input1, input3);
117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpackhi_epi32(input1, input3);
118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Switch column2, column 3, and then, we got:
120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // input2: column1, column 0;  input3: column2, column 3.
121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shuffle_epi32(input1, 0x4e);
122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input0, input1);
123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_sub_epi16(input0, input1);
124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final round and shift
126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input2, eight);
127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi16(input3, eight);
128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi16(input2, 4);
130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi16(input3, 4);
131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define RECON_AND_STORE4X4(dest, in_x) \
133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {                                                     \
134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      d0 = _mm_unpacklo_epi8(d0, zero); \
136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      d0 = _mm_add_epi16(in_x, d0); \
137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      d0 = _mm_packus_epi16(d0, d0); \
138ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      *(int *)dest = _mm_cvtsi128_si32(d0); \
139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      dest += stride; \
140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srli_si128(input2, 8);
143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srli_si128(input3, 8);
144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE4X4(dest, input2);
146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE4X4(dest, input0);
147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE4X4(dest, input1);
148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE4X4(dest, input3);
149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
15291037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i dc_value;
15391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i zero = _mm_setzero_si128();
15491037db265ecdd914a26e056cf69207b4f50924ehkuang  int a;
15591037db265ecdd914a26e056cf69207b4f50924ehkuang
15691037db265ecdd914a26e056cf69207b4f50924ehkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
15791037db265ecdd914a26e056cf69207b4f50924ehkuang  a = dct_const_round_shift(a * cospi_16_64);
15891037db265ecdd914a26e056cf69207b4f50924ehkuang  a = ROUND_POWER_OF_TWO(a, 4);
15991037db265ecdd914a26e056cf69207b4f50924ehkuang
16091037db265ecdd914a26e056cf69207b4f50924ehkuang  dc_value = _mm_set1_epi16(a);
16191037db265ecdd914a26e056cf69207b4f50924ehkuang
16291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
16391037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
16491037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
16591037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
16691037db265ecdd914a26e056cf69207b4f50924ehkuang}
16791037db265ecdd914a26e056cf69207b4f50924ehkuang
16891037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void transpose_4x4(__m128i *res) {
16991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
17091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
17191037db265ecdd914a26e056cf69207b4f50924ehkuang  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
17291037db265ecdd914a26e056cf69207b4f50924ehkuang  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
17391037db265ecdd914a26e056cf69207b4f50924ehkuang
17491037db265ecdd914a26e056cf69207b4f50924ehkuang  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
17591037db265ecdd914a26e056cf69207b4f50924ehkuang  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
17691037db265ecdd914a26e056cf69207b4f50924ehkuang}
17791037db265ecdd914a26e056cf69207b4f50924ehkuang
1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void idct4_1d_sse2(__m128i *in) {
17991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
18091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
18191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
18291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
18391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
18491037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u[8], v[8];
18591037db265ecdd914a26e056cf69207b4f50924ehkuang
18691037db265ecdd914a26e056cf69207b4f50924ehkuang  transpose_4x4(in);
18791037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
18891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
18991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
19091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
19191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
19291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
19391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
19491037db265ecdd914a26e056cf69207b4f50924ehkuang
19591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
19691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
19791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
19891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
19991037db265ecdd914a26e056cf69207b4f50924ehkuang
20091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
20191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
20291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
20391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
20491037db265ecdd914a26e056cf69207b4f50924ehkuang
20591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_packs_epi32(v[0], v[2]);
20691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_packs_epi32(v[1], v[3]);
20791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpackhi_epi64(u[0], u[0]);
20891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi64(u[1], u[1]);
20991037db265ecdd914a26e056cf69207b4f50924ehkuang
21091037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
21191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_add_epi16(u[0], u[3]);
21291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_add_epi16(u[1], u[2]);
21391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_sub_epi16(u[1], u[2]);
21491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_sub_epi16(u[0], u[3]);
21591037db265ecdd914a26e056cf69207b4f50924ehkuang}
21691037db265ecdd914a26e056cf69207b4f50924ehkuang
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void iadst4_1d_sse2(__m128i *in) {
21891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
21991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
22091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
22191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
22291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
22391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i kZero = _mm_set1_epi16(0);
22491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
22591037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u[8], v[8], in7;
22691037db265ecdd914a26e056cf69207b4f50924ehkuang
22791037db265ecdd914a26e056cf69207b4f50924ehkuang  transpose_4x4(in);
22891037db265ecdd914a26e056cf69207b4f50924ehkuang  in7 = _mm_add_epi16(in[0], in[3]);
22991037db265ecdd914a26e056cf69207b4f50924ehkuang  in7 = _mm_sub_epi16(in7, in[2]);
23091037db265ecdd914a26e056cf69207b4f50924ehkuang
23191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
23291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
23391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(in7, kZero);
23491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpacklo_epi16(in[1], kZero);
23591037db265ecdd914a26e056cf69207b4f50924ehkuang
23691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
23791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
23891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
23991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
24091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
24191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
24291037db265ecdd914a26e056cf69207b4f50924ehkuang
24391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[1]);
24491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[3], v[4]);
24591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = v[2];
24691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(u[0], u[1]);
24791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_slli_epi32(v[5], 2);
24891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(u[3], v[5]);
24991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_sub_epi32(u[5], u[4]);
25091037db265ecdd914a26e056cf69207b4f50924ehkuang
25191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
25291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
25391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
25491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
25591037db265ecdd914a26e056cf69207b4f50924ehkuang
25691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
25791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
25891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
25991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
26091037db265ecdd914a26e056cf69207b4f50924ehkuang
26191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_packs_epi32(u[0], u[2]);
26291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_packs_epi32(u[1], u[3]);
26391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_unpackhi_epi64(in[0], in[0]);
26491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_unpackhi_epi64(in[1], in[1]);
26591037db265ecdd914a26e056cf69207b4f50924ehkuang}
26691037db265ecdd914a26e056cf69207b4f50924ehkuang
2675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            int tx_type) {
26991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i in[4];
27091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i zero = _mm_setzero_si128();
27191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i eight = _mm_set1_epi16(8);
27291037db265ecdd914a26e056cf69207b4f50924ehkuang
2735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[0] = _mm_loadl_epi64((const __m128i *)input);
2745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
2755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
2765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));
27791037db265ecdd914a26e056cf69207b4f50924ehkuang
27891037db265ecdd914a26e056cf69207b4f50924ehkuang  switch (tx_type) {
27991037db265ecdd914a26e056cf69207b4f50924ehkuang    case 0:  // DCT_DCT
28091037db265ecdd914a26e056cf69207b4f50924ehkuang      idct4_1d_sse2(in);
28191037db265ecdd914a26e056cf69207b4f50924ehkuang      idct4_1d_sse2(in);
28291037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
28391037db265ecdd914a26e056cf69207b4f50924ehkuang    case 1:  // ADST_DCT
28491037db265ecdd914a26e056cf69207b4f50924ehkuang      idct4_1d_sse2(in);
28591037db265ecdd914a26e056cf69207b4f50924ehkuang      iadst4_1d_sse2(in);
28691037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
28791037db265ecdd914a26e056cf69207b4f50924ehkuang    case 2:  // DCT_ADST
28891037db265ecdd914a26e056cf69207b4f50924ehkuang      iadst4_1d_sse2(in);
28991037db265ecdd914a26e056cf69207b4f50924ehkuang      idct4_1d_sse2(in);
29091037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
29191037db265ecdd914a26e056cf69207b4f50924ehkuang    case 3:  // ADST_ADST
29291037db265ecdd914a26e056cf69207b4f50924ehkuang      iadst4_1d_sse2(in);
29391037db265ecdd914a26e056cf69207b4f50924ehkuang      iadst4_1d_sse2(in);
29491037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
29591037db265ecdd914a26e056cf69207b4f50924ehkuang    default:
29691037db265ecdd914a26e056cf69207b4f50924ehkuang      assert(0);
29791037db265ecdd914a26e056cf69207b4f50924ehkuang      break;
29891037db265ecdd914a26e056cf69207b4f50924ehkuang  }
29991037db265ecdd914a26e056cf69207b4f50924ehkuang
30091037db265ecdd914a26e056cf69207b4f50924ehkuang  // Final round and shift
30191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_add_epi16(in[0], eight);
30291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_add_epi16(in[1], eight);
30391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_add_epi16(in[2], eight);
30491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_add_epi16(in[3], eight);
30591037db265ecdd914a26e056cf69207b4f50924ehkuang
30691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_srai_epi16(in[0], 4);
30791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_srai_epi16(in[1], 4);
30891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_srai_epi16(in[2], 4);
30991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_srai_epi16(in[3], 4);
31091037db265ecdd914a26e056cf69207b4f50924ehkuang
31191037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, in[0]);
31291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, in[1]);
31391037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, in[2]);
31491037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, in[3]);
31591037db265ecdd914a26e056cf69207b4f50924ehkuang}
31691037db265ecdd914a26e056cf69207b4f50924ehkuang
// Transposes an 8x8 block of 16-bit values.
//   in0..in7:   __m128i rows 0..7 of the source block.
//   out0..out7: __m128i lvalues that receive columns 0..7 (i.e. the rows of
//               the transposed block).
// Every input is read into a temporary before any output is written, so the
// outputs may be the same variables as the inputs.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,             \
                      out0, out1, out2, out3, out4, out5, out6, out7)     \
  {                                                                       \
    /* Interleave 16-bit lanes of neighbouring rows. */                   \
    const __m128i t8_r01_lo = _mm_unpacklo_epi16(in0, in1);               \
    const __m128i t8_r23_lo = _mm_unpacklo_epi16(in2, in3);               \
    const __m128i t8_r01_hi = _mm_unpackhi_epi16(in0, in1);               \
    const __m128i t8_r23_hi = _mm_unpackhi_epi16(in2, in3);               \
    const __m128i t8_r45_lo = _mm_unpacklo_epi16(in4, in5);               \
    const __m128i t8_r67_lo = _mm_unpacklo_epi16(in6, in7);               \
    const __m128i t8_r45_hi = _mm_unpackhi_epi16(in4, in5);               \
    const __m128i t8_r67_hi = _mm_unpackhi_epi16(in6, in7);               \
                                                                          \
    /* Interleave 32-bit lanes: each register now holds two columns */    \
    /* for rows 0-3 ("top") or rows 4-7 ("bot").                    */    \
    const __m128i t8_top_c01 = _mm_unpacklo_epi32(t8_r01_lo, t8_r23_lo);  \
    const __m128i t8_top_c23 = _mm_unpackhi_epi32(t8_r01_lo, t8_r23_lo);  \
    const __m128i t8_top_c45 = _mm_unpacklo_epi32(t8_r01_hi, t8_r23_hi);  \
    const __m128i t8_top_c67 = _mm_unpackhi_epi32(t8_r01_hi, t8_r23_hi);  \
    const __m128i t8_bot_c01 = _mm_unpacklo_epi32(t8_r45_lo, t8_r67_lo);  \
    const __m128i t8_bot_c23 = _mm_unpackhi_epi32(t8_r45_lo, t8_r67_lo);  \
    const __m128i t8_bot_c45 = _mm_unpacklo_epi32(t8_r45_hi, t8_r67_hi);  \
    const __m128i t8_bot_c67 = _mm_unpackhi_epi32(t8_r45_hi, t8_r67_hi);  \
                                                                          \
    /* Join the top and bottom halves of every column. */                 \
    out0 = _mm_unpacklo_epi64(t8_top_c01, t8_bot_c01);                    \
    out1 = _mm_unpackhi_epi64(t8_top_c01, t8_bot_c01);                    \
    out2 = _mm_unpacklo_epi64(t8_top_c23, t8_bot_c23);                    \
    out3 = _mm_unpackhi_epi64(t8_top_c23, t8_bot_c23);                    \
    out4 = _mm_unpacklo_epi64(t8_top_c45, t8_bot_c45);                    \
    out5 = _mm_unpackhi_epi64(t8_top_c45, t8_bot_c45);                    \
    out6 = _mm_unpacklo_epi64(t8_top_c67, t8_bot_c67);                    \
    out7 = _mm_unpackhi_epi64(t8_top_c67, t8_bot_c67);                    \
  }
347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Transposes the left four columns of eight rows of 16-bit values.
//   in0..in7:   __m128i rows 0..7; only the four low 16-bit lanes of each
//               row are read.
//   out0..out3: receive columns 0..3, eight values each.
//   out4..out7: are set to `zero`.
// Expects `zero` (an all-zero __m128i) to be in scope where the macro is
// expanded. Every input is read into a temporary before any output is
// written, so the outputs may be the same variables as the inputs.
#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7,             \
                      out0, out1, out2, out3, out4, out5, out6, out7)     \
  {                                                                       \
    /* Interleave 16-bit lanes of neighbouring rows (low halves only). */ \
    const __m128i t4_r01 = _mm_unpacklo_epi16(in0, in1);                  \
    const __m128i t4_r23 = _mm_unpacklo_epi16(in2, in3);                  \
    const __m128i t4_r45 = _mm_unpacklo_epi16(in4, in5);                  \
    const __m128i t4_r67 = _mm_unpacklo_epi16(in6, in7);                  \
                                                                          \
    /* Two columns per register, for rows 0-3 and rows 4-7. */            \
    const __m128i t4_top_c01 = _mm_unpacklo_epi32(t4_r01, t4_r23);        \
    const __m128i t4_top_c23 = _mm_unpackhi_epi32(t4_r01, t4_r23);        \
    const __m128i t4_bot_c01 = _mm_unpacklo_epi32(t4_r45, t4_r67);        \
    const __m128i t4_bot_c23 = _mm_unpackhi_epi32(t4_r45, t4_r67);        \
                                                                          \
    out0 = _mm_unpacklo_epi64(t4_top_c01, t4_bot_c01);                    \
    out1 = _mm_unpackhi_epi64(t4_top_c01, t4_bot_c01);                    \
    out2 = _mm_unpacklo_epi64(t4_top_c23, t4_bot_c23);                    \
    out3 = _mm_unpackhi_epi64(t4_top_c23, t4_bot_c23);                    \
    out7 = zero;                                                          \
    out6 = zero;                                                          \
    out5 = zero;                                                          \
    out4 = zero;                                                          \
  }
367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Transposes four rows of eight 16-bit values into four registers that each
// hold two columns.
//   in0..in3:   __m128i rows 0..3 (read only).
//   out0..out3: __m128i lvalues receiving, as low | high 64 bits,
//               column 0 | column 1, column 2 | column 3,
//               column 4 | column 5, column 6 | column 7.
// The results are written to out0..out3. Earlier revisions ignored the out
// parameters and overwrote in0..in3 instead; call sites that pass the same
// variables for inputs and outputs get identical results either way, because
// every input is read into a temporary before any output is written.
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3)         \
  {                                                                       \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);                   \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);                   \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1);                   \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3);                   \
                                                                          \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */                 \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */                 \
    out2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */                 \
    out3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */                 \
  }
380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Multiplies interleaved 16-bit pairs by packed constant pairs, sums each
// pair into a 32-bit lane, rounds, shifts right by DCT_CONST_BITS and packs
// back to saturated 16-bit lanes:
//   res0 = pack(round(lo_0 . cst0), round(hi_0 . cst0))
//   res1 = pack(round(lo_0 . cst1), round(hi_0 . cst1))
//   res2 = pack(round(lo_1 . cst2), round(hi_1 . cst2))
//   res3 = pack(round(lo_1 . cst3), round(hi_1 . cst3))
// Not self-contained: the expansion site must have __m128i tmp0..tmp7
// (clobbered), a __m128i `rounding` (32-bit lanes) and DCT_CONST_BITS in scope.
// Expands to a brace block, so it may be invoked with or without a trailing ';'.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  {                                                \
    /* pairwise multiply-accumulate into 32 bits */ \
    tmp0 = _mm_madd_epi16(lo_0, cst0);             \
    tmp1 = _mm_madd_epi16(hi_0, cst0);             \
    tmp2 = _mm_madd_epi16(lo_0, cst1);             \
    tmp3 = _mm_madd_epi16(hi_0, cst1);             \
    tmp4 = _mm_madd_epi16(lo_1, cst2);             \
    tmp5 = _mm_madd_epi16(hi_1, cst2);             \
    tmp6 = _mm_madd_epi16(lo_1, cst3);             \
    tmp7 = _mm_madd_epi16(hi_1, cst3);             \
                                                   \
    /* add the rounding term before shifting */    \
    tmp0 = _mm_add_epi32(tmp0, rounding);          \
    tmp1 = _mm_add_epi32(tmp1, rounding);          \
    tmp2 = _mm_add_epi32(tmp2, rounding);          \
    tmp3 = _mm_add_epi32(tmp3, rounding);          \
    tmp4 = _mm_add_epi32(tmp4, rounding);          \
    tmp5 = _mm_add_epi32(tmp5, rounding);          \
    tmp6 = _mm_add_epi32(tmp6, rounding);          \
    tmp7 = _mm_add_epi32(tmp7, rounding);          \
                                                   \
    /* arithmetic shift drops the fixed-point bits */ \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);   \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);   \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);   \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);   \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);   \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);   \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);   \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);   \
                                                   \
    /* narrow to 16 bits with signed saturation */ \
    res0 = _mm_packs_epi32(tmp0, tmp1);            \
    res1 = _mm_packs_epi32(tmp2, tmp3);            \
    res2 = _mm_packs_epi32(tmp4, tmp5);            \
    res3 = _mm_packs_epi32(tmp6, tmp7);            \
  }
417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// In-place 4-stage 1-D inverse DCT over the eight registers in0..in7.
// Expands to a sequence of statements that already ends in ';', so it is
// invoked as a bare `IDCT8_1D`. The expansion site must declare:
//   in0..in7              inputs, overwritten with the outputs
//   stp1_0..stp1_7,
//   stp2_0..stp2_7        per-stage scratch registers
//   tmp0..tmp7            32-bit scratch (also used by MULTIPLICATION_AND_ADD)
//   rounding              32-bit rounding term added before the shift
//   stg1_0..stg1_3,
//   stg2_0..stg2_3        packed 16-bit constant pairs
// Butterfly additions/subtractions use the saturating 16-bit intrinsics.
#define IDCT8_1D  \
  /* Stage1: rotate the odd inputs (1,7) and (3,5) */ \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                           stg1_1, stg1_2, stg1_3, stp1_4,     \
                           stp1_7, stp1_5, stp1_6)             \
  } \
    \
  /* Stage2: rotate the even inputs (0,4) and (2,6); butterfly the odd half */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0,     \
                           stp2_1, stp2_2, stp2_3)             \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
    \
  /* Stage3: butterfly the even half; rotate (stp2_6, stp2_5) by stg2_1/stg2_0 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4: final butterflies, results written back over the inputs */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);
486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Adds the eight 16-bit residuals in in_x to the eight pixels at `dest`,
// clamps each sum to 0..255, writes the 8 bytes back and then advances `dest`
// by `stride`.
// Not self-contained: the expansion site must have a zeroed __m128i `zero`
// and an integer `stride` in scope, and `dest` must be a modifiable byte
// pointer (it is incremented as a side effect).
// Expands to a brace block, so it may be invoked with or without a trailing ';'.
#define RECON_AND_STORE(dest, in_x) \
  {                                                   \
    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest));  \
    d0 = _mm_unpacklo_epi8(d0, zero);  /* u8 -> 16 bit */ \
    d0 = _mm_add_epi16(in_x, d0);                     \
    d0 = _mm_packus_epi16(d0, d0);  /* saturate to u8 */ \
    _mm_storel_epi64((__m128i *)(dest), d0);          \
    dest += stride;                                   \
  }
496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
4975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<4);
501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Load input data.
5175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in0 = _mm_load_si128((const __m128i *)input);
5185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
5195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
5205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
5215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
5225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
5235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
5245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 2-D
527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 2; i++) {
5285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                  in4, in5, in6, in7);
531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // 4-stage 1D idct8x8
5335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    IDCT8_1D
534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final rounding and shift
537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(in0, final_rounding);
538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(in1, final_rounding);
539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(in2, final_rounding);
540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(in3, final_rounding);
541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_adds_epi16(in4, final_rounding);
542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_adds_epi16(in5, final_rounding);
543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_adds_epi16(in6, final_rounding);
544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_adds_epi16(in7, final_rounding);
545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_srai_epi16(in0, 5);
547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_srai_epi16(in1, 5);
548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_srai_epi16(in2, 5);
549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_srai_epi16(in3, 5);
550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_srai_epi16(in4, 5);
551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_srai_epi16(in5, 5);
552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_srai_epi16(in6, 5);
553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_srai_epi16(in7, 5);
554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in0);
556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in1);
557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in2);
558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in3);
559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in4);
560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in5);
561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in6);
562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in7);
563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
566f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  __m128i dc_value;
567f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  const __m128i zero = _mm_setzero_si128();
568f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int a;
569f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
570f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
571f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(a * cospi_16_64);
572f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = ROUND_POWER_OF_TWO(a, 5);
573f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
574f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  dc_value = _mm_set1_epi16(a);
575f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
576f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
577f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
578f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
579f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
580f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
581f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
582f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
583f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  RECON_AND_STORE(dest, dc_value);
584f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang}
585f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
58691037db265ecdd914a26e056cf69207b4f50924ehkuang// perform 8x8 transpose
58791037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
58891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
58991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
59091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
59191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
59291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
59391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
59491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
59591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
59691037db265ecdd914a26e056cf69207b4f50924ehkuang
59791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
59891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
59991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
60091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
60191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
60291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
60391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
60491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
60591037db265ecdd914a26e056cf69207b4f50924ehkuang
60691037db265ecdd914a26e056cf69207b4f50924ehkuang  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
60791037db265ecdd914a26e056cf69207b4f50924ehkuang  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
60891037db265ecdd914a26e056cf69207b4f50924ehkuang  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
60991037db265ecdd914a26e056cf69207b4f50924ehkuang  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
61091037db265ecdd914a26e056cf69207b4f50924ehkuang  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
61191037db265ecdd914a26e056cf69207b4f50924ehkuang  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
61291037db265ecdd914a26e056cf69207b4f50924ehkuang  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
61391037db265ecdd914a26e056cf69207b4f50924ehkuang  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
61491037db265ecdd914a26e056cf69207b4f50924ehkuang}
61591037db265ecdd914a26e056cf69207b4f50924ehkuang
6165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void idct8_1d_sse2(__m128i *in) {
61791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
61891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
61991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
62091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
62191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
62291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
62391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
62491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
62591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
62691037db265ecdd914a26e056cf69207b4f50924ehkuang
62791037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
62891037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
62991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
63091037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
63191037db265ecdd914a26e056cf69207b4f50924ehkuang
63291037db265ecdd914a26e056cf69207b4f50924ehkuang  in0 = in[0];
63391037db265ecdd914a26e056cf69207b4f50924ehkuang  in1 = in[1];
63491037db265ecdd914a26e056cf69207b4f50924ehkuang  in2 = in[2];
63591037db265ecdd914a26e056cf69207b4f50924ehkuang  in3 = in[3];
63691037db265ecdd914a26e056cf69207b4f50924ehkuang  in4 = in[4];
63791037db265ecdd914a26e056cf69207b4f50924ehkuang  in5 = in[5];
63891037db265ecdd914a26e056cf69207b4f50924ehkuang  in6 = in[6];
63991037db265ecdd914a26e056cf69207b4f50924ehkuang  in7 = in[7];
64091037db265ecdd914a26e056cf69207b4f50924ehkuang
6415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
64291037db265ecdd914a26e056cf69207b4f50924ehkuang  TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
64391037db265ecdd914a26e056cf69207b4f50924ehkuang                in4, in5, in6, in7);
64491037db265ecdd914a26e056cf69207b4f50924ehkuang
64591037db265ecdd914a26e056cf69207b4f50924ehkuang  // 4-stage 1D idct8x8
6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  IDCT8_1D
64791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = in0;
64891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = in1;
64991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = in2;
65091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = in3;
65191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = in4;
65291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = in5;
65391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = in6;
65491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = in7;
65591037db265ecdd914a26e056cf69207b4f50924ehkuang}
65691037db265ecdd914a26e056cf69207b4f50924ehkuang
6575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void iadst8_1d_sse2(__m128i *in) {
65891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
65991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
66091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
66191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
66291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
66391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
66491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
66591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
66691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
66791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
66891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
66991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
67091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
67191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__const_0 = _mm_set1_epi16(0);
67291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
67391037db265ecdd914a26e056cf69207b4f50924ehkuang
67491037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
67591037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
67691037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
67791037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
67891037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
67991037db265ecdd914a26e056cf69207b4f50924ehkuang
68091037db265ecdd914a26e056cf69207b4f50924ehkuang  // transpose
68191037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(in, in);
68291037db265ecdd914a26e056cf69207b4f50924ehkuang
68391037db265ecdd914a26e056cf69207b4f50924ehkuang  // properly aligned for butterfly input
68491037db265ecdd914a26e056cf69207b4f50924ehkuang  in0  = in[7];
68591037db265ecdd914a26e056cf69207b4f50924ehkuang  in1  = in[0];
68691037db265ecdd914a26e056cf69207b4f50924ehkuang  in2  = in[5];
68791037db265ecdd914a26e056cf69207b4f50924ehkuang  in3  = in[2];
68891037db265ecdd914a26e056cf69207b4f50924ehkuang  in4  = in[3];
68991037db265ecdd914a26e056cf69207b4f50924ehkuang  in5  = in[4];
69091037db265ecdd914a26e056cf69207b4f50924ehkuang  in6  = in[1];
69191037db265ecdd914a26e056cf69207b4f50924ehkuang  in7  = in[6];
69291037db265ecdd914a26e056cf69207b4f50924ehkuang
69391037db265ecdd914a26e056cf69207b4f50924ehkuang  // column transformation
69491037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
69591037db265ecdd914a26e056cf69207b4f50924ehkuang  // interleave and multiply/add into 32-bit integer
69691037db265ecdd914a26e056cf69207b4f50924ehkuang  s0 = _mm_unpacklo_epi16(in0, in1);
69791037db265ecdd914a26e056cf69207b4f50924ehkuang  s1 = _mm_unpackhi_epi16(in0, in1);
69891037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_unpacklo_epi16(in2, in3);
69991037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_unpackhi_epi16(in2, in3);
70091037db265ecdd914a26e056cf69207b4f50924ehkuang  s4 = _mm_unpacklo_epi16(in4, in5);
70191037db265ecdd914a26e056cf69207b4f50924ehkuang  s5 = _mm_unpackhi_epi16(in4, in5);
70291037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_unpacklo_epi16(in6, in7);
70391037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_unpackhi_epi16(in6, in7);
70491037db265ecdd914a26e056cf69207b4f50924ehkuang
70591037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
70691037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
70791037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
70891037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
70991037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
71091037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
71191037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
71291037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
71391037db265ecdd914a26e056cf69207b4f50924ehkuang  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
71491037db265ecdd914a26e056cf69207b4f50924ehkuang  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
71591037db265ecdd914a26e056cf69207b4f50924ehkuang  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
71691037db265ecdd914a26e056cf69207b4f50924ehkuang  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
71791037db265ecdd914a26e056cf69207b4f50924ehkuang  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
71891037db265ecdd914a26e056cf69207b4f50924ehkuang  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
71991037db265ecdd914a26e056cf69207b4f50924ehkuang  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
72091037db265ecdd914a26e056cf69207b4f50924ehkuang  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
72191037db265ecdd914a26e056cf69207b4f50924ehkuang
72291037db265ecdd914a26e056cf69207b4f50924ehkuang  // addition
72391037db265ecdd914a26e056cf69207b4f50924ehkuang  w0 = _mm_add_epi32(u0, u8);
72491037db265ecdd914a26e056cf69207b4f50924ehkuang  w1 = _mm_add_epi32(u1, u9);
72591037db265ecdd914a26e056cf69207b4f50924ehkuang  w2 = _mm_add_epi32(u2, u10);
72691037db265ecdd914a26e056cf69207b4f50924ehkuang  w3 = _mm_add_epi32(u3, u11);
72791037db265ecdd914a26e056cf69207b4f50924ehkuang  w4 = _mm_add_epi32(u4, u12);
72891037db265ecdd914a26e056cf69207b4f50924ehkuang  w5 = _mm_add_epi32(u5, u13);
72991037db265ecdd914a26e056cf69207b4f50924ehkuang  w6 = _mm_add_epi32(u6, u14);
73091037db265ecdd914a26e056cf69207b4f50924ehkuang  w7 = _mm_add_epi32(u7, u15);
73191037db265ecdd914a26e056cf69207b4f50924ehkuang  w8 = _mm_sub_epi32(u0, u8);
73291037db265ecdd914a26e056cf69207b4f50924ehkuang  w9 = _mm_sub_epi32(u1, u9);
73391037db265ecdd914a26e056cf69207b4f50924ehkuang  w10 = _mm_sub_epi32(u2, u10);
73491037db265ecdd914a26e056cf69207b4f50924ehkuang  w11 = _mm_sub_epi32(u3, u11);
73591037db265ecdd914a26e056cf69207b4f50924ehkuang  w12 = _mm_sub_epi32(u4, u12);
73691037db265ecdd914a26e056cf69207b4f50924ehkuang  w13 = _mm_sub_epi32(u5, u13);
73791037db265ecdd914a26e056cf69207b4f50924ehkuang  w14 = _mm_sub_epi32(u6, u14);
73891037db265ecdd914a26e056cf69207b4f50924ehkuang  w15 = _mm_sub_epi32(u7, u15);
73991037db265ecdd914a26e056cf69207b4f50924ehkuang
74091037db265ecdd914a26e056cf69207b4f50924ehkuang  // shift and rounding
74191037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
74291037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
74391037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
74491037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
74591037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
74691037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
74791037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
74891037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
74991037db265ecdd914a26e056cf69207b4f50924ehkuang  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
75091037db265ecdd914a26e056cf69207b4f50924ehkuang  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
75191037db265ecdd914a26e056cf69207b4f50924ehkuang  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
75291037db265ecdd914a26e056cf69207b4f50924ehkuang  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
75391037db265ecdd914a26e056cf69207b4f50924ehkuang  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
75491037db265ecdd914a26e056cf69207b4f50924ehkuang  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
75591037db265ecdd914a26e056cf69207b4f50924ehkuang  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
75691037db265ecdd914a26e056cf69207b4f50924ehkuang  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
75791037db265ecdd914a26e056cf69207b4f50924ehkuang
75891037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
75991037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
76091037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
76191037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
76291037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
76391037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
76491037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
76591037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
76691037db265ecdd914a26e056cf69207b4f50924ehkuang  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
76791037db265ecdd914a26e056cf69207b4f50924ehkuang  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
76891037db265ecdd914a26e056cf69207b4f50924ehkuang  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
76991037db265ecdd914a26e056cf69207b4f50924ehkuang  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
77091037db265ecdd914a26e056cf69207b4f50924ehkuang  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
77191037db265ecdd914a26e056cf69207b4f50924ehkuang  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
77291037db265ecdd914a26e056cf69207b4f50924ehkuang  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
77391037db265ecdd914a26e056cf69207b4f50924ehkuang  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
77491037db265ecdd914a26e056cf69207b4f50924ehkuang
77591037db265ecdd914a26e056cf69207b4f50924ehkuang  // back to 16-bit and pack 8 integers into __m128i
77691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_packs_epi32(u0, u1);
77791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_packs_epi32(u2, u3);
77891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_packs_epi32(u4, u5);
77991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_packs_epi32(u6, u7);
78091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_packs_epi32(u8, u9);
78191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_packs_epi32(u10, u11);
78291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_packs_epi32(u12, u13);
78391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_packs_epi32(u14, u15);
78491037db265ecdd914a26e056cf69207b4f50924ehkuang
78591037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
78691037db265ecdd914a26e056cf69207b4f50924ehkuang  s0 = _mm_add_epi16(in[0], in[2]);
78791037db265ecdd914a26e056cf69207b4f50924ehkuang  s1 = _mm_add_epi16(in[1], in[3]);
78891037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_sub_epi16(in[0], in[2]);
78991037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_sub_epi16(in[1], in[3]);
79091037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_unpacklo_epi16(in[4], in[5]);
79191037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_unpackhi_epi16(in[4], in[5]);
79291037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_unpacklo_epi16(in[6], in[7]);
79391037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_unpackhi_epi16(in[6], in[7]);
79491037db265ecdd914a26e056cf69207b4f50924ehkuang
79591037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
79691037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
79791037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
79891037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
79991037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
80091037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
80191037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
80291037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
80391037db265ecdd914a26e056cf69207b4f50924ehkuang
80491037db265ecdd914a26e056cf69207b4f50924ehkuang  w0 = _mm_add_epi32(v0, v4);
80591037db265ecdd914a26e056cf69207b4f50924ehkuang  w1 = _mm_add_epi32(v1, v5);
80691037db265ecdd914a26e056cf69207b4f50924ehkuang  w2 = _mm_add_epi32(v2, v6);
80791037db265ecdd914a26e056cf69207b4f50924ehkuang  w3 = _mm_add_epi32(v3, v7);
80891037db265ecdd914a26e056cf69207b4f50924ehkuang  w4 = _mm_sub_epi32(v0, v4);
80991037db265ecdd914a26e056cf69207b4f50924ehkuang  w5 = _mm_sub_epi32(v1, v5);
81091037db265ecdd914a26e056cf69207b4f50924ehkuang  w6 = _mm_sub_epi32(v2, v6);
81191037db265ecdd914a26e056cf69207b4f50924ehkuang  w7 = _mm_sub_epi32(v3, v7);
81291037db265ecdd914a26e056cf69207b4f50924ehkuang
81391037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
81491037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
81591037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
81691037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
81791037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
81891037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
81991037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
82091037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
82191037db265ecdd914a26e056cf69207b4f50924ehkuang
82291037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
82391037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
82491037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
82591037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
82691037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
82791037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
82891037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
82991037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
83091037db265ecdd914a26e056cf69207b4f50924ehkuang
83191037db265ecdd914a26e056cf69207b4f50924ehkuang  // back to 16-bit integers
83291037db265ecdd914a26e056cf69207b4f50924ehkuang  s4 = _mm_packs_epi32(u0, u1);
83391037db265ecdd914a26e056cf69207b4f50924ehkuang  s5 = _mm_packs_epi32(u2, u3);
83491037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_packs_epi32(u4, u5);
83591037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_packs_epi32(u6, u7);
83691037db265ecdd914a26e056cf69207b4f50924ehkuang
83791037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
83891037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_unpacklo_epi16(s2, s3);
83991037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_unpackhi_epi16(s2, s3);
84091037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_unpacklo_epi16(s6, s7);
84191037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_unpackhi_epi16(s6, s7);
84291037db265ecdd914a26e056cf69207b4f50924ehkuang
84391037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
84491037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
84591037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
84691037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
84791037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
84891037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
84991037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
85091037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
85191037db265ecdd914a26e056cf69207b4f50924ehkuang
85291037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
85391037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
85491037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
85591037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
85691037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
85791037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
85891037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
85991037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
86091037db265ecdd914a26e056cf69207b4f50924ehkuang
86191037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
86291037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
86391037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
86491037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
86591037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
86691037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
86791037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
86891037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
86991037db265ecdd914a26e056cf69207b4f50924ehkuang
87091037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_packs_epi32(v0, v1);
87191037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_packs_epi32(v2, v3);
87291037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_packs_epi32(v4, v5);
87391037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_packs_epi32(v6, v7);
87491037db265ecdd914a26e056cf69207b4f50924ehkuang
87591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = s0;
87691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_sub_epi16(k__const_0, s4);
87791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = s6;
87891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_sub_epi16(k__const_0, s2);
87991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = s3;
88091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_sub_epi16(k__const_0, s7);
88191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = s5;
88291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_sub_epi16(k__const_0, s1);
88391037db265ecdd914a26e056cf69207b4f50924ehkuang}
88491037db265ecdd914a26e056cf69207b4f50924ehkuang
88591037db265ecdd914a26e056cf69207b4f50924ehkuang
/*
 * 8x8 inverse hybrid transform with reconstruction.
 *
 * Loads 64 coefficients from `input` as eight 8-lane rows, runs two 1-D
 * transform passes chosen by `tx_type`, applies the final rounding
 * ((x + 16) >> 5, with a saturating add), and hands every row to
 * RECON_AND_STORE together with `dest`.
 *
 * input   - 64 int16_t coefficients. Read with _mm_load_si128, so the buffer
 *           must be 16-byte aligned.
 * dest    - destination pixel buffer, passed to RECON_AND_STORE for each row.
 * stride  - not referenced directly in this function; presumably consumed by
 *           RECON_AND_STORE to step `dest` between rows (macro is defined
 *           elsewhere in the file -- confirm).
 * tx_type - 0..3 selects the pair of 1-D transforms; any other value hits
 *           assert(0) and, when asserts are compiled out, skips both passes.
 */
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  // `zero` is not used directly below; it is kept in scope because the
  // RECON_AND_STORE macro (defined elsewhere) presumably refers to it.
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  int row;

  // Fetch the eight coefficient rows (8 x int16_t each).
  for (row = 0; row < 8; ++row) {
    in[row] = _mm_load_si128((const __m128i *)(input + 8 * row));
  }

  // Two in-place 1-D passes over `in`; the combination depends on tx_type.
  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift: saturating add of 16, then arithmetic >> 5.
  for (row = 0; row < 8; ++row) {
    in[row] = _mm_srai_epi16(_mm_adds_epi16(in[row], final_rounding), 5);
  }

  // Reconstruct: one RECON_AND_STORE per row, in row order.
  for (row = 0; row < 8; ++row) {
    RECON_AND_STORE(dest, in[row]);
  }
}
95291037db265ecdd914a26e056cf69207b4f50924ehkuang
9535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
954ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
955ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
956ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<4);
957ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
958ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
959ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
960ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
961ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
962ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
963ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
964ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
965ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
966ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
967ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
968ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
969ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
970ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
971ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
972ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Rows. Load 4-row input data.
9735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in0 = _mm_load_si128((const __m128i *)input);
9745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
9755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
9765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
977ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
978ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 8x4 Transpose
979ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
980ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
981ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage1
9825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { //NOLINT
983ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
984ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);
985ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
986ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
987ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
988ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
989ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
990ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
991ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
992ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
993ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
994ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
995ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
996ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
997ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
998ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
999ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1000ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_4 = _mm_packs_epi32(tmp0, zero);
1001ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_7 = _mm_packs_epi32(tmp2, zero);
1002ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp4, zero);
1003ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp6, zero);
1004ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1005ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1006ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage2
10075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { //NOLINT
1008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
1009ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);
1010ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1011ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
1012ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
1013ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
1014ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
1015ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1016ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
1017ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
1018ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
1019ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
1020ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1021ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1022ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1023ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1024ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1025ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_0 = _mm_packs_epi32(tmp0, zero);
1026ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_1 = _mm_packs_epi32(tmp2, zero);
1027ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_2 = _mm_packs_epi32(tmp4, zero);
1028ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_3 = _mm_packs_epi32(tmp6, zero);
1029ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1030ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
1031ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
1032ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
1033ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
1034ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1035ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1036ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage3
10375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { //NOLINT
1038ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
1039ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
1040ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
1041ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
1042ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);
1043ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1044ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
1045ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
1046ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1047ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
1048ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
1049ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1050ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1051ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1052ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp0, zero);
1053ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp2, zero);
1054ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1055ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1056ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage4
1057ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(stp1_0, stp2_7);
1058ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(stp1_1, stp1_6);
1059ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(stp1_2, stp1_5);
1060ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(stp1_3, stp2_4);
1061ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_subs_epi16(stp1_3, stp2_4);
1062ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_subs_epi16(stp1_2, stp1_5);
1063ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_subs_epi16(stp1_1, stp1_6);
1064ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_subs_epi16(stp1_0, stp2_7);
1065ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1066ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Columns. 4x8 Transpose
1067ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
1068ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                in4, in5, in6, in7)
1069ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1070ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 1D idct8x8
10715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  IDCT8_1D
1072ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1073ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final rounding and shift
1074ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(in0, final_rounding);
1075ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(in1, final_rounding);
1076ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(in2, final_rounding);
1077ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(in3, final_rounding);
1078ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_adds_epi16(in4, final_rounding);
1079ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_adds_epi16(in5, final_rounding);
1080ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_adds_epi16(in6, final_rounding);
1081ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_adds_epi16(in7, final_rounding);
1082ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1083ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_srai_epi16(in0, 5);
1084ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_srai_epi16(in1, 5);
1085ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_srai_epi16(in2, 5);
1086ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_srai_epi16(in3, 5);
1087ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_srai_epi16(in4, 5);
1088ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_srai_epi16(in5, 5);
1089ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_srai_epi16(in6, 5);
1090ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_srai_epi16(in7, 5);
1091ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1092ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in0);
1093ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in1);
1094ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in2);
1095ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in3);
1096ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in4);
1097ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in5);
1098ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in6);
1099ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in7);
1100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
/*
 * IDCT16_1D: stages 2 through 6 of a 1-D 16-point inverse DCT, computed on the
 * eight 16-bit lanes of in0..in15.
 *
 * The macro takes no arguments. It expands in place and expects the enclosing
 * scope to declare every name it touches:
 *   - inputs:    in0..in15
 *   - constants: stg2_0..stg2_7, stg3_0..stg3_3, stg4_0..stg4_7, stg6_0,
 *                rounding
 *   - scratch:   stp1_0..stp1_15, stp1_8_0, stp1_12_0, stp2_0..stp2_15,
 *                tmp0..tmp3 (MULTIPLICATION_AND_ADD is defined elsewhere and
 *                may need further temporaries - check its definition)
 *
 * After the expansion, the values consumed by the final add/sub stage in
 * vp9_idct16x16_256_add_sse2 are stp2_0..stp2_7, stp1_8, stp1_9,
 * stp2_10..stp2_13, stp1_14 and stp1_15.
 */
11025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#define IDCT16_1D \
1103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* Stage2: pair the odd-indexed inputs and produce stp2_8..stp2_15. */ \
1104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  { \
1105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
1106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
1107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7);   \
1108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7);   \
1109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
1110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
1111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
1112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
1113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
1115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stg2_0, stg2_1, stg2_2, stg2_3, \
1116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stp2_8, stp2_15, stp2_9, stp2_14) \
1117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
1119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stg2_4, stg2_5, stg2_6, stg2_7, \
1120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stp2_10, stp2_13, stp2_11, stp2_12) \
1121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  } \
1122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* Stage3: in2/in14 and in10/in6 -> stp1_4..stp1_7; add/sub pairs of stp2_8..stp2_15. */ \
1124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  { \
1125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
1126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
1127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
1128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
1129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
1131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stg3_0, stg3_1, stg3_2, stg3_3, \
1132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stp1_4, stp1_7, stp1_5, stp1_6) \
1133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
1135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
1136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
1137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
1138ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
1140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
1141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
1142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
1143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  } \
1144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  \
1145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* Stage4: in0/in8 and in4/in12 -> stp2_0..stp2_3; stp1_9/14 and stp1_10/13 -> stp2_9/14 and stp2_10/13. */ \
1146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  { \
1147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
1148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
1149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
1150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
1151ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1152ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1153ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1154ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
1158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stg4_0, stg4_1, stg4_2, stg4_3, \
1159ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stp2_0, stp2_1, stp2_2, stp2_3) \
1160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1161ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
1162ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
1163ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
1164ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
1165ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1166ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stg4_4, stg4_5, stg4_6, stg4_7, \
1168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stp2_9, stp2_14, stp2_10, stp2_13) \
1169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  } \
1170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* Stage5: stp2_0..3 -> stp1_0..3; stp2_6/stp2_5 -> stp1_5/stp1_6; finish stp1_8..stp1_15. */ \
1172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  { \
1173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
1177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
1178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
1179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
1180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /* Multiply-accumulate the interleaved (stp2_6, stp2_5) pairs with stg4_1 and stg4_0. */ \
1181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /* Add rounding, shift right by DCT_CONST_BITS, then pack to 16 bits with signed saturation. */ \
1186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding); \
1187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding); \
1188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding); \
1189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding); \
1190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /* stp1_8_0 and stp1_12_0 are the Stage3 sums that were kept aside for these add/sub pairs. */ \
1199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
1200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
1201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
1202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
1206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
1207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  } \
1209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* Stage6: fold stp2_4, stp1_5, stp1_6, stp2_7 into stp2_0..stp2_7; stp1_10..stp1_13 -> stp2_10..stp2_13. */ \
1211ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  { \
1212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1216ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /* Order matters: stp2_0 and stp2_3 read stp2_7 and stp2_4 before those two are overwritten below. */ \
1217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1221ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    \
1226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1227ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stg6_0, stg4_0, stg6_0, stg4_0, \
1228ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                           stp2_10, stp2_13, stp2_11, stp2_12) \
1229ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1230ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1230ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
/*
 * 2-D 16x16 inverse DCT (SSE2) whose output is handed to RECON_AND_STORE.
 *
 * input:  256 16-bit coefficients (offsets 0..255 are read). The loads use
 *         _mm_load_si128, so the buffer must be 16-byte aligned.
 * dest:   8-bit destination updated through RECON_AND_STORE (defined
 *         elsewhere; presumably adds the residual to the existing pixels with
 *         saturation - confirm against the macro).
 * stride: used here only to rewind dest after sixteen RECON_AND_STORE calls;
 *         presumably the distance between consecutive rows of dest.
 *
 * The loop expands IDCT16_1D four times, each time on 16 registers of eight
 * 16-bit lanes:
 *   i = 0, 1: first 1-D pass over input[0..127] and input[128..255]; the
 *             final add/sub results are parked in l0..l15 and r0..r15.
 *   i = 2, 3: second 1-D pass over the transposed l/r registers; results are
 *             rounded, shifted right by 6 and passed to RECON_AND_STORE.
 *
 * Many locals (in*, stg*, stp*, tmp*, rounding, zero) are referenced by name
 * from inside the macros used below, so they must not be renamed.
 */
12315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
12325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                int stride) {
1233ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1234ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
1235ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
1236ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
  // Constant pairs consumed by IDCT16_1D; the stgN_ prefix is the stage that
  // first uses them (stg4_0 and stg4_1 are reused in stages 5 and 6).
1237ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1238ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1239ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1240ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1241ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1242ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1243ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1247ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1248ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1249ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1250ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1251ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1254ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1255ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1256ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1257ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1258ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1259ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1260ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1261ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
  // in*: operands of the current 1-D pass. l*/r*: results of the first pass
  // for i == 0 and i == 1 respectively, read back when i is 2 or 3.
1262ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
1263ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
1264ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
1265ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in14 = zero, in15 = zero;
1266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
1267ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
1268ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
1269ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
1270ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
1271ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          r12 = zero, r13 = zero, r14 = zero, r15 = zero;
  // Scratch registers written and read by IDCT16_1D.
1272ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1273ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1274ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8_0, stp1_12_0;
1275ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1276ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1278ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
1279ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1280ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct.
1281ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 4; i++) {
1282ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // 1-D idct
1283ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i < 2) {
      // The second iteration reads the second half of the 256 coefficients.
1284ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      if (i == 1) input += 128;
1285ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1286ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Load input data.
      // Each run of 16 consecutive coefficients is split in two: its first 8
      // go to in0..in7 and its last 8 to in8..in15.
12875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in0 = _mm_load_si128((const __m128i *)input);
12885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
12895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
12905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
12915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
12925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
12935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
12945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
12955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
12965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
12975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
12985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
12995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
13005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
13015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
13025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
1303ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
      // Transpose each group of eight registers in place (TRANSPOSE_8X8 is
      // defined elsewhere; the last eight arguments are presumably outputs).
1304ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
1305ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
1306ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
1307ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in10, in11, in12, in13, in14, in15);
1308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1309ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
    // Second pass, first half: feed the transposed l0..l7 and r0..r7.
1310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 2) {
1311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
1312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in5, in6, in7);
1313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
1314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in13, in14, in15);
1315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
    // Second pass, second half: feed the transposed l8..l15 and r8..r15.
1317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 3) {
1318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
1319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
1320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
1321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in12, in13, in14, in15);
1322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
    // Stages 2-6. The macro expands in place and works on the in*, stg*,
    // stp*, tmp* and rounding locals declared above.
13245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    IDCT16_1D
1325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage7
    // Output k is (low term + high term) for k < 8 and the mirrored
    // (low term - high term) for k >= 8; only the destination registers
    // differ between the three branches.
1327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 0) {
1328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Left 8x16
1329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l0 = _mm_add_epi16(stp2_0, stp1_15);
1330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l1 = _mm_add_epi16(stp2_1, stp1_14);
1331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l2 = _mm_add_epi16(stp2_2, stp2_13);
1332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l3 = _mm_add_epi16(stp2_3, stp2_12);
1333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l4 = _mm_add_epi16(stp2_4, stp2_11);
1334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l5 = _mm_add_epi16(stp2_5, stp2_10);
1335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l6 = _mm_add_epi16(stp2_6, stp1_9);
1336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l7 = _mm_add_epi16(stp2_7, stp1_8);
1337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l8 = _mm_sub_epi16(stp2_7, stp1_8);
1338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l9 = _mm_sub_epi16(stp2_6, stp1_9);
1339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l10 = _mm_sub_epi16(stp2_5, stp2_10);
1340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l11 = _mm_sub_epi16(stp2_4, stp2_11);
1341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l12 = _mm_sub_epi16(stp2_3, stp2_12);
1342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l13 = _mm_sub_epi16(stp2_2, stp2_13);
1343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l14 = _mm_sub_epi16(stp2_1, stp1_14);
1344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l15 = _mm_sub_epi16(stp2_0, stp1_15);
1345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else if (i == 1) {
1346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Right 8x16
1347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r0 = _mm_add_epi16(stp2_0, stp1_15);
1348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r1 = _mm_add_epi16(stp2_1, stp1_14);
1349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r2 = _mm_add_epi16(stp2_2, stp2_13);
1350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r3 = _mm_add_epi16(stp2_3, stp2_12);
1351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r4 = _mm_add_epi16(stp2_4, stp2_11);
1352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r5 = _mm_add_epi16(stp2_5, stp2_10);
1353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r6 = _mm_add_epi16(stp2_6, stp1_9);
1354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r7 = _mm_add_epi16(stp2_7, stp1_8);
1355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r8 = _mm_sub_epi16(stp2_7, stp1_8);
1356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r9 = _mm_sub_epi16(stp2_6, stp1_9);
1357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r10 = _mm_sub_epi16(stp2_5, stp2_10);
1358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r11 = _mm_sub_epi16(stp2_4, stp2_11);
1359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r12 = _mm_sub_epi16(stp2_3, stp2_12);
1360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r13 = _mm_sub_epi16(stp2_2, stp2_13);
1361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r14 = _mm_sub_epi16(stp2_1, stp1_14);
1362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r15 = _mm_sub_epi16(stp2_0, stp1_15);
1363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else {
1364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 2-D
1365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_add_epi16(stp2_0, stp1_15);
1366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_add_epi16(stp2_1, stp1_14);
1367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_add_epi16(stp2_2, stp2_13);
1368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_add_epi16(stp2_3, stp2_12);
1369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_add_epi16(stp2_4, stp2_11);
1370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_add_epi16(stp2_5, stp2_10);
1371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_add_epi16(stp2_6, stp1_9);
1372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_add_epi16(stp2_7, stp1_8);
1373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_sub_epi16(stp2_7, stp1_8);
1374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_sub_epi16(stp2_6, stp1_9);
1375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_sub_epi16(stp2_5, stp2_10);
1376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_sub_epi16(stp2_4, stp2_11);
1377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_sub_epi16(stp2_3, stp2_12);
1378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_sub_epi16(stp2_2, stp2_13);
1379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_sub_epi16(stp2_1, stp1_14);
1380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_sub_epi16(stp2_0, stp1_15);
1381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Final rounding and shift
      // Add 1 << 5 with signed saturation, then arithmetic shift right by 6.
1383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_adds_epi16(in0, final_rounding);
1384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_adds_epi16(in1, final_rounding);
1385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_adds_epi16(in2, final_rounding);
1386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_adds_epi16(in3, final_rounding);
1387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_adds_epi16(in4, final_rounding);
1388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_adds_epi16(in5, final_rounding);
1389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_adds_epi16(in6, final_rounding);
1390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_adds_epi16(in7, final_rounding);
1391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_adds_epi16(in8, final_rounding);
1392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_adds_epi16(in9, final_rounding);
1393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_adds_epi16(in10, final_rounding);
1394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_adds_epi16(in11, final_rounding);
1395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_adds_epi16(in12, final_rounding);
1396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_adds_epi16(in13, final_rounding);
1397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_adds_epi16(in14, final_rounding);
1398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_adds_epi16(in15, final_rounding);
1399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_srai_epi16(in0, 6);
1401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_srai_epi16(in1, 6);
1402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_srai_epi16(in2, 6);
1403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_srai_epi16(in3, 6);
1404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_srai_epi16(in4, 6);
1405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_srai_epi16(in5, 6);
1406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_srai_epi16(in6, 6);
1407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_srai_epi16(in7, 6);
1408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_srai_epi16(in8, 6);
1409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_srai_epi16(in9, 6);
1410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_srai_epi16(in10, 6);
1411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_srai_epi16(in11, 6);
1412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_srai_epi16(in12, 6);
1413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_srai_epi16(in13, 6);
1414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_srai_epi16(in14, 6);
1415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_srai_epi16(in15, 6);
1416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
      // One call per output register, in order in0..in15.
1417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in0);
1418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in1);
1419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in2);
1420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in3);
1421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in4);
1422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in5);
1423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in6);
1424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in7);
1425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in8);
1426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in9);
1427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in10);
1428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in11);
1429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in12);
1430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in13);
1431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in14);
1432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in15);
1433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
      // Move 8 bytes right and 16 strides back for the next iteration.
      // NOTE(review): assumes each RECON_AND_STORE advanced dest by stride -
      // confirm against the macro definition.
1434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      dest += 8 - (stride * 16);
1435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
14395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
1440f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  __m128i dc_value;
1441f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  const __m128i zero = _mm_setzero_si128();
1442f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int a, i;
1443f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
1444f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
1445f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = dct_const_round_shift(a * cospi_16_64);
1446f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a = ROUND_POWER_OF_TWO(a, 6);
1447f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
1448f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  dc_value = _mm_set1_epi16(a);
1449f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
1450f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  for (i = 0; i < 2; ++i) {
1451f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1452f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1453f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1454f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1455f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1456f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1457f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1458f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1459f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1460f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1461f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1462f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1463f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1464f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1465f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1466f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    RECON_AND_STORE(dest, dc_value);
1467f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    dest += 8 - (stride * 16);
1468f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  }
1469f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang}
1470f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
147191037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
147291037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i tbuf[8];
147391037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res0, res0);
147491037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res1, tbuf);
147591037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res0 + 8, res1);
147691037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res1 + 8, res1 + 8);
147791037db265ecdd914a26e056cf69207b4f50924ehkuang
147891037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[8] = tbuf[0];
147991037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[9] = tbuf[1];
148091037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[10] = tbuf[2];
148191037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[11] = tbuf[3];
148291037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[12] = tbuf[4];
148391037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[13] = tbuf[5];
148491037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[14] = tbuf[6];
148591037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[15] = tbuf[7];
148691037db265ecdd914a26e056cf69207b4f50924ehkuang}
148791037db265ecdd914a26e056cf69207b4f50924ehkuang
14885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void iadst16_1d_8col(__m128i *in) {
148991037db265ecdd914a26e056cf69207b4f50924ehkuang  // perform 16x16 1-D ADST for 8 columns
149091037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i s[16], x[16], u[32], v[32];
149191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
149291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
149391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
149491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
149591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
149691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
149791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
149891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
149991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
150091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
150191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
150291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
150391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
150491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
150591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
150691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
150791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
150891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
150991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
151091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
151191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
151291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
151391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
151491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
151591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
151691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
151791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
151891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
151991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
152091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
152191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i kZero = _mm_set1_epi16(0);
152291037db265ecdd914a26e056cf69207b4f50924ehkuang
152391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
152491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
152591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
152691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
152791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
152891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
152991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
153091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
153191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
153291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
153391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
153491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
153591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
153691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
153791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
153891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
153991037db265ecdd914a26e056cf69207b4f50924ehkuang
154091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
154191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
154291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
154391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
154491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
154591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
154691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
154791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
154891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
154991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
155091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
155191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
155291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
155391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
155491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
155591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
155691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
155791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
155891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
155991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
156091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
156191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
156291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
156391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
156491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
156591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
156691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
156791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
156891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
156991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
157091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
157191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
157291037db265ecdd914a26e056cf69207b4f50924ehkuang
157391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[16]);
157491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[17]);
157591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[18]);
157691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[19]);
157791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], v[20]);
157891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], v[21]);
157991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], v[22]);
158091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], v[23]);
158191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], v[24]);
158291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], v[25]);
158391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], v[26]);
158491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], v[27]);
158591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], v[28]);
158691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], v[29]);
158791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], v[30]);
158891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], v[31]);
158991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[16] = _mm_sub_epi32(v[0], v[16]);
159091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[17] = _mm_sub_epi32(v[1], v[17]);
159191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[18] = _mm_sub_epi32(v[2], v[18]);
159291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[19] = _mm_sub_epi32(v[3], v[19]);
159391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[20] = _mm_sub_epi32(v[4], v[20]);
159491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[21] = _mm_sub_epi32(v[5], v[21]);
159591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[22] = _mm_sub_epi32(v[6], v[22]);
159691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[23] = _mm_sub_epi32(v[7], v[23]);
159791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[24] = _mm_sub_epi32(v[8], v[24]);
159891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[25] = _mm_sub_epi32(v[9], v[25]);
159991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[26] = _mm_sub_epi32(v[10], v[26]);
160091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[27] = _mm_sub_epi32(v[11], v[27]);
160191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[28] = _mm_sub_epi32(v[12], v[28]);
160291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[29] = _mm_sub_epi32(v[13], v[29]);
160391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[30] = _mm_sub_epi32(v[14], v[30]);
160491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[31] = _mm_sub_epi32(v[15], v[31]);
160591037db265ecdd914a26e056cf69207b4f50924ehkuang
160691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
160791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
160891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
160991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
161091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
161191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
161291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
161391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
161491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
161591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
161691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
161791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
161891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
161991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
162091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
162191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
162291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
162391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
162491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
162591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
162691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
162791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
162891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
162991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
163091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
163191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
163291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
163391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
163491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
163591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
163691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
163791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
163891037db265ecdd914a26e056cf69207b4f50924ehkuang
163991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
164091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
164191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
164291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
164391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
164491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
164591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
164691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
164791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
164891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
164991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
165091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
165191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
165291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
165391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
165491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
165591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
165691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
165791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
165891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
165991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
166091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
166191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
166291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
166391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
166491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
166591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
166691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
166791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
166891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
166991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
167091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
167191037db265ecdd914a26e056cf69207b4f50924ehkuang
167291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_packs_epi32(u[0], u[1]);
167391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_packs_epi32(u[2], u[3]);
167491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_packs_epi32(u[4], u[5]);
167591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_packs_epi32(u[6], u[7]);
167691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_packs_epi32(u[8], u[9]);
167791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_packs_epi32(u[10], u[11]);
167891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_packs_epi32(u[12], u[13]);
167991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_packs_epi32(u[14], u[15]);
168091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = _mm_packs_epi32(u[16], u[17]);
168191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = _mm_packs_epi32(u[18], u[19]);
168291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[20], u[21]);
168391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[22], u[23]);
168491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[24], u[25]);
168591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[26], u[27]);
168691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[28], u[29]);
168791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(u[30], u[31]);
168891037db265ecdd914a26e056cf69207b4f50924ehkuang
168991037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
169091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
169191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
169291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
169391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
169491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
169591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
169691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
169791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
169891037db265ecdd914a26e056cf69207b4f50924ehkuang
169991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
170091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
170191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
170291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
170391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
170491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
170591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
170691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
170791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
170891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
170991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
171091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
171191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
171291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
171391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
171491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
171591037db265ecdd914a26e056cf69207b4f50924ehkuang
171691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[8]);
171791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[9]);
171891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[10]);
171991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[11]);
172091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], v[12]);
172191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], v[13]);
172291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], v[14]);
172391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], v[15]);
172491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_sub_epi32(v[0], v[8]);
172591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_sub_epi32(v[1], v[9]);
172691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_sub_epi32(v[2], v[10]);
172791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_sub_epi32(v[3], v[11]);
172891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_sub_epi32(v[4], v[12]);
172991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_sub_epi32(v[5], v[13]);
173091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_sub_epi32(v[6], v[14]);
173191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_sub_epi32(v[7], v[15]);
173291037db265ecdd914a26e056cf69207b4f50924ehkuang
173391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
173491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
173591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
173691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
173791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
173891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
173991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
174091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
174191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
174291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
174391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
174491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
174591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
174691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
174791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
174891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
174991037db265ecdd914a26e056cf69207b4f50924ehkuang
175091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
175191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
175291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
175391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
175491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
175591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
175691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
175791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
175891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
175991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
176091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
176191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
176291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
176391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
176491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
176591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
176691037db265ecdd914a26e056cf69207b4f50924ehkuang
176791037db265ecdd914a26e056cf69207b4f50924ehkuang  x[0] = _mm_add_epi16(s[0], s[4]);
176891037db265ecdd914a26e056cf69207b4f50924ehkuang  x[1] = _mm_add_epi16(s[1], s[5]);
176991037db265ecdd914a26e056cf69207b4f50924ehkuang  x[2] = _mm_add_epi16(s[2], s[6]);
177091037db265ecdd914a26e056cf69207b4f50924ehkuang  x[3] = _mm_add_epi16(s[3], s[7]);
177191037db265ecdd914a26e056cf69207b4f50924ehkuang  x[4] = _mm_sub_epi16(s[0], s[4]);
177291037db265ecdd914a26e056cf69207b4f50924ehkuang  x[5] = _mm_sub_epi16(s[1], s[5]);
177391037db265ecdd914a26e056cf69207b4f50924ehkuang  x[6] = _mm_sub_epi16(s[2], s[6]);
177491037db265ecdd914a26e056cf69207b4f50924ehkuang  x[7] = _mm_sub_epi16(s[3], s[7]);
177591037db265ecdd914a26e056cf69207b4f50924ehkuang  x[8] = _mm_packs_epi32(u[0], u[1]);
177691037db265ecdd914a26e056cf69207b4f50924ehkuang  x[9] = _mm_packs_epi32(u[2], u[3]);
177791037db265ecdd914a26e056cf69207b4f50924ehkuang  x[10] = _mm_packs_epi32(u[4], u[5]);
177891037db265ecdd914a26e056cf69207b4f50924ehkuang  x[11] = _mm_packs_epi32(u[6], u[7]);
177991037db265ecdd914a26e056cf69207b4f50924ehkuang  x[12] = _mm_packs_epi32(u[8], u[9]);
178091037db265ecdd914a26e056cf69207b4f50924ehkuang  x[13] = _mm_packs_epi32(u[10], u[11]);
178191037db265ecdd914a26e056cf69207b4f50924ehkuang  x[14] = _mm_packs_epi32(u[12], u[13]);
178291037db265ecdd914a26e056cf69207b4f50924ehkuang  x[15] = _mm_packs_epi32(u[14], u[15]);
178391037db265ecdd914a26e056cf69207b4f50924ehkuang
178491037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
178591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
178691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
178791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
178891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
178991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
179091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
179191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
179291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
179391037db265ecdd914a26e056cf69207b4f50924ehkuang
179491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
179591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
179691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
179791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
179891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
179991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
180091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
180191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
180291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
180391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
180491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
180591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
180691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
180791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
180891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
180991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
181091037db265ecdd914a26e056cf69207b4f50924ehkuang
181191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[4]);
181291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[5]);
181391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[6]);
181491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[7]);
181591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_sub_epi32(v[0], v[4]);
181691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_sub_epi32(v[1], v[5]);
181791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_sub_epi32(v[2], v[6]);
181891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_sub_epi32(v[3], v[7]);
181991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], v[12]);
182091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], v[13]);
182191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], v[14]);
182291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], v[15]);
182391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_sub_epi32(v[8], v[12]);
182491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_sub_epi32(v[9], v[13]);
182591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_sub_epi32(v[10], v[14]);
182691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_sub_epi32(v[11], v[15]);
182791037db265ecdd914a26e056cf69207b4f50924ehkuang
182891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
182991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
183091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
183191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
183291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
183391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
183491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
183591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
183691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
183791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
183891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
183991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
184091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
184191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
184291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
184391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
184491037db265ecdd914a26e056cf69207b4f50924ehkuang
184591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
184691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
184791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
184891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
184991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
185091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
185191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
185291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
185391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
185491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
185591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
185691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
185791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
185891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
185991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
186091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
186191037db265ecdd914a26e056cf69207b4f50924ehkuang
186291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_add_epi16(x[0], x[2]);
186391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_add_epi16(x[1], x[3]);
186491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_sub_epi16(x[0], x[2]);
186591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_sub_epi16(x[1], x[3]);
186691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_packs_epi32(v[0], v[1]);
186791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_packs_epi32(v[2], v[3]);
186891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_packs_epi32(v[4], v[5]);
186991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_packs_epi32(v[6], v[7]);
187091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = _mm_add_epi16(x[8], x[10]);
187191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = _mm_add_epi16(x[9], x[11]);
187291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_sub_epi16(x[8], x[10]);
187391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_sub_epi16(x[9], x[11]);
187491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(v[8], v[9]);
187591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(v[10], v[11]);
187691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(v[12], v[13]);
187791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(v[14], v[15]);
187891037db265ecdd914a26e056cf69207b4f50924ehkuang
187991037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 4
188091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
188191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
188291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
188391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
188491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
188591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
188691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
188791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
188891037db265ecdd914a26e056cf69207b4f50924ehkuang
188991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
189091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
189191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
189291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
189391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
189491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
189591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
189691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
189791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
189891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
189991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
190091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
190191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
190291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
190391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
190491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
190591037db265ecdd914a26e056cf69207b4f50924ehkuang
190691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
190791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
190891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
190991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
191091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
191191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
191291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
191391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
191491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
191591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
191691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
191791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
191891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
191991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
192091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
192191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
192291037db265ecdd914a26e056cf69207b4f50924ehkuang
192391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
192491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
192591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
192691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
192791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
192891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
192991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
193091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
193191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
193291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
193391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
193491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
193591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
193691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
193791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
193891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
193991037db265ecdd914a26e056cf69207b4f50924ehkuang
194091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = s[0];
194191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_sub_epi16(kZero, s[8]);
194291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = s[12];
194391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_sub_epi16(kZero, s[4]);
194491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_packs_epi32(v[4], v[5]);
194591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_packs_epi32(v[12], v[13]);
194691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_packs_epi32(v[8], v[9]);
194791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_packs_epi32(v[0], v[1]);
194891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_packs_epi32(v[2], v[3]);
194991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_packs_epi32(v[10], v[11]);
195091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_packs_epi32(v[14], v[15]);
195191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_packs_epi32(v[6], v[7]);
195291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = s[5];
195391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_sub_epi16(kZero, s[13]);
195491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = s[9];
195591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_sub_epi16(kZero, s[1]);
195691037db265ecdd914a26e056cf69207b4f50924ehkuang}
195791037db265ecdd914a26e056cf69207b4f50924ehkuang
19585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void idct16_1d_8col(__m128i *in) {
195991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
196091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
196191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
196291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
196391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
196491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
196591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
196691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
196791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
196891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
196991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
197091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
197191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
197291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
197391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
197491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
197591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
197691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
197791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
197891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
197991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
198091037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i v[16], u[16], s[16], t[16];
198191037db265ecdd914a26e056cf69207b4f50924ehkuang
198291037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
198391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = in[0];
198491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = in[8];
198591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = in[4];
198691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = in[12];
198791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = in[2];
198891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = in[10];
198991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = in[6];
199091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = in[14];
199191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = in[1];
199291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = in[9];
199391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = in[5];
199491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = in[13];
199591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = in[3];
199691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = in[11];
199791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = in[7];
199891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = in[15];
199991037db265ecdd914a26e056cf69207b4f50924ehkuang
200091037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
200191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
200291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
200391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
200491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
200591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
200691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
200791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
200891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
200991037db265ecdd914a26e056cf69207b4f50924ehkuang
201091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
201191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
201291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
201391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
201491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
201591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
201691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
201791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
201891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
201991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
202091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
202191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
202291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
202391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
202491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
202591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
202691037db265ecdd914a26e056cf69207b4f50924ehkuang
202791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
202891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
202991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
203091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
203191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
203291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
203391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
203491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
203591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
203691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
203791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
203891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
203991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
204091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
204191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
204291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
204391037db265ecdd914a26e056cf69207b4f50924ehkuang
204491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
204591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
204691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
204791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
204891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
204991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
205091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
205191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
205291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
205391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
205491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
205591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
205691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
205791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
205891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
205991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
206091037db265ecdd914a26e056cf69207b4f50924ehkuang
206191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8]  = _mm_packs_epi32(u[0], u[1]);
206291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(u[2], u[3]);
206391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9]  = _mm_packs_epi32(u[4], u[5]);
206491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[6], u[7]);
206591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[8], u[9]);
206691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[10], u[11]);
206791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[12], u[13]);
206891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[14], u[15]);
206991037db265ecdd914a26e056cf69207b4f50924ehkuang
207091037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
207191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[0] = s[0];
207291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[1] = s[1];
207391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[2] = s[2];
207491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[3] = s[3];
207591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
207691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
207791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
207891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
207991037db265ecdd914a26e056cf69207b4f50924ehkuang
208091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
208191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
208291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
208391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
208491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
208591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
208691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
208791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
208891037db265ecdd914a26e056cf69207b4f50924ehkuang
208991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
209091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
209191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
209291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
209391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
209491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
209591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
209691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
209791037db265ecdd914a26e056cf69207b4f50924ehkuang
209891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
209991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
210091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
210191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
210291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
210391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
210491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
210591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
210691037db265ecdd914a26e056cf69207b4f50924ehkuang
210791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[4] = _mm_packs_epi32(u[0], u[1]);
210891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[7] = _mm_packs_epi32(u[2], u[3]);
210991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[5] = _mm_packs_epi32(u[4], u[5]);
211091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[6] = _mm_packs_epi32(u[6], u[7]);
211191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[8] = _mm_add_epi16(s[8], s[9]);
211291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[9] = _mm_sub_epi16(s[8], s[9]);
211391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[10] = _mm_sub_epi16(s[11], s[10]);
211491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[11] = _mm_add_epi16(s[10], s[11]);
211591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[12] = _mm_add_epi16(s[12], s[13]);
211691037db265ecdd914a26e056cf69207b4f50924ehkuang  t[13] = _mm_sub_epi16(s[12], s[13]);
211791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[14] = _mm_sub_epi16(s[15], s[14]);
211891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[15] = _mm_add_epi16(s[14], s[15]);
211991037db265ecdd914a26e056cf69207b4f50924ehkuang
212091037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 4
212191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
212291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
212391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
212491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
212591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
212691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
212791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
212891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
212991037db265ecdd914a26e056cf69207b4f50924ehkuang
213091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
213191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
213291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
213391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
213491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
213591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
213691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
213791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
213891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
213991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
214091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
214191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
214291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
214391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
214491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
214591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
214691037db265ecdd914a26e056cf69207b4f50924ehkuang
214791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
214891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
214991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
215091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
215191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
215291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
215391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
215491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
215591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
215691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
215791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
215891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
215991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
216091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
216191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
216291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
216391037db265ecdd914a26e056cf69207b4f50924ehkuang
216491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
216591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
216691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
216791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
216891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
216991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
217091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
217191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
217291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
217391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
217491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
217591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
217691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
217791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
217891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
217991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
218091037db265ecdd914a26e056cf69207b4f50924ehkuang
218191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_packs_epi32(u[0], u[1]);
218291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_packs_epi32(u[2], u[3]);
218391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_packs_epi32(u[4], u[5]);
218491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_packs_epi32(u[6], u[7]);
218591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_add_epi16(t[4], t[5]);
218691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_sub_epi16(t[4], t[5]);
218791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_sub_epi16(t[7], t[6]);
218891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_add_epi16(t[6], t[7]);
218991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = t[8];
219091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = t[15];
219191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9]  = _mm_packs_epi32(u[8], u[9]);
219291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[10], u[11]);
219391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[12], u[13]);
219491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[14], u[15]);
219591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = t[11];
219691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = t[12];
219791037db265ecdd914a26e056cf69207b4f50924ehkuang
219891037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 5
219991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[0] = _mm_add_epi16(s[0], s[3]);
220091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[1] = _mm_add_epi16(s[1], s[2]);
220191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[2] = _mm_sub_epi16(s[1], s[2]);
220291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[3] = _mm_sub_epi16(s[0], s[3]);
220391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[4] = s[4];
220491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[7] = s[7];
220591037db265ecdd914a26e056cf69207b4f50924ehkuang
220691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
220791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
220891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
220991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
221091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
221191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
221291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
221391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
221491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
221591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
221691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
221791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
221891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
221991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
222091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[5] = _mm_packs_epi32(u[0], u[1]);
222191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[6] = _mm_packs_epi32(u[2], u[3]);
222291037db265ecdd914a26e056cf69207b4f50924ehkuang
222391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[8] = _mm_add_epi16(s[8], s[11]);
222491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[9] = _mm_add_epi16(s[9], s[10]);
222591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[10] = _mm_sub_epi16(s[9], s[10]);
222691037db265ecdd914a26e056cf69207b4f50924ehkuang  t[11] = _mm_sub_epi16(s[8], s[11]);
222791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[12] = _mm_sub_epi16(s[15], s[12]);
222891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[13] = _mm_sub_epi16(s[14], s[13]);
222991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[14] = _mm_add_epi16(s[13], s[14]);
223091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[15] = _mm_add_epi16(s[12], s[15]);
223191037db265ecdd914a26e056cf69207b4f50924ehkuang
223291037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 6
223391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_add_epi16(t[0], t[7]);
223491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_add_epi16(t[1], t[6]);
223591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_add_epi16(t[2], t[5]);
223691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_add_epi16(t[3], t[4]);
223791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_sub_epi16(t[3], t[4]);
223891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_sub_epi16(t[2], t[5]);
223991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_sub_epi16(t[1], t[6]);
224091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_sub_epi16(t[0], t[7]);
224191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = t[8];
224291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = t[9];
224391037db265ecdd914a26e056cf69207b4f50924ehkuang
224491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
224591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
224691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
224791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
224891037db265ecdd914a26e056cf69207b4f50924ehkuang
224991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
225091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
225191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
225291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
225391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
225491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
225591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
225691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
225791037db265ecdd914a26e056cf69207b4f50924ehkuang
225891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
225991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
226091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
226191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
226291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
226391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
226491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
226591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
226691037db265ecdd914a26e056cf69207b4f50924ehkuang
226791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
226891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
226991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
227091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
227191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
227291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
227391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
227491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
227591037db265ecdd914a26e056cf69207b4f50924ehkuang
227691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[0], u[1]);
227791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[2], u[3]);
227891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[4], u[5]);
227991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[6], u[7]);
228091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = t[14];
228191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = t[15];
228291037db265ecdd914a26e056cf69207b4f50924ehkuang
228391037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 7
228491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_add_epi16(s[0], s[15]);
228591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_add_epi16(s[1], s[14]);
228691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_add_epi16(s[2], s[13]);
228791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_add_epi16(s[3], s[12]);
228891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_add_epi16(s[4], s[11]);
228991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_add_epi16(s[5], s[10]);
229091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_add_epi16(s[6], s[9]);
229191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_add_epi16(s[7], s[8]);
229291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_sub_epi16(s[7], s[8]);
229391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_sub_epi16(s[6], s[9]);
229491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_sub_epi16(s[5], s[10]);
229591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_sub_epi16(s[4], s[11]);
229691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = _mm_sub_epi16(s[3], s[12]);
229791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_sub_epi16(s[2], s[13]);
229891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = _mm_sub_epi16(s[1], s[14]);
229991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_sub_epi16(s[0], s[15]);
230091037db265ecdd914a26e056cf69207b4f50924ehkuang}
230191037db265ecdd914a26e056cf69207b4f50924ehkuang
// One 1-D inverse-DCT pass over a block held as two register arrays.
//
// in0, in1: the two halves of the block; the caller in this file passes two
//           16-entry __m128i arrays. Both are updated in place.
//
// The pair is first run through array_transpose_16x16, then each half goes
// through idct16_1d_8col on its own. vp9_iht16x16_256_add_sse2 calls this
// once per dimension to build the 2-D transform.
static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_1d_8col(in0);
  idct16_1d_8col(in1);
}
230791037db265ecdd914a26e056cf69207b4f50924ehkuang
// One 1-D inverse-ADST pass over a block held as two register arrays.
//
// in0, in1: the two halves of the block; the caller in this file passes two
//           16-entry __m128i arrays. Both are updated in place.
//
// Mirrors idct16_1d_sse2: the pair is run through array_transpose_16x16 and
// then each half is processed separately by iadst16_1d_8col.
static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_1d_8col(in0);
  iadst16_1d_8col(in1);
}
231391037db265ecdd914a26e056cf69207b4f50924ehkuang
23145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
23155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
23165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
23175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
23185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
23195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
23205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
23215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
23225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
23235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
23245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
23255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
23265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
23275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
23285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
23295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
23305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
23315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
233291037db265ecdd914a26e056cf69207b4f50924ehkuang}
233391037db265ecdd914a26e056cf69207b4f50924ehkuang
233491037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
233591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
233691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i zero = _mm_setzero_si128();
233791037db265ecdd914a26e056cf69207b4f50924ehkuang  // Final rounding and shift
233891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_adds_epi16(in[0], final_rounding);
233991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_adds_epi16(in[1], final_rounding);
234091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_adds_epi16(in[2], final_rounding);
234191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_adds_epi16(in[3], final_rounding);
234291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_adds_epi16(in[4], final_rounding);
234391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_adds_epi16(in[5], final_rounding);
234491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_adds_epi16(in[6], final_rounding);
234591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_adds_epi16(in[7], final_rounding);
234691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_adds_epi16(in[8], final_rounding);
234791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_adds_epi16(in[9], final_rounding);
234891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_adds_epi16(in[10], final_rounding);
234991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_adds_epi16(in[11], final_rounding);
235091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = _mm_adds_epi16(in[12], final_rounding);
235191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_adds_epi16(in[13], final_rounding);
235291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = _mm_adds_epi16(in[14], final_rounding);
235391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_adds_epi16(in[15], final_rounding);
235491037db265ecdd914a26e056cf69207b4f50924ehkuang
235591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_srai_epi16(in[0], 6);
235691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_srai_epi16(in[1], 6);
235791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_srai_epi16(in[2], 6);
235891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_srai_epi16(in[3], 6);
235991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_srai_epi16(in[4], 6);
236091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_srai_epi16(in[5], 6);
236191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_srai_epi16(in[6], 6);
236291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_srai_epi16(in[7], 6);
236391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_srai_epi16(in[8], 6);
236491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_srai_epi16(in[9], 6);
236591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_srai_epi16(in[10], 6);
236691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_srai_epi16(in[11], 6);
236791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = _mm_srai_epi16(in[12], 6);
236891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_srai_epi16(in[13], 6);
236991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = _mm_srai_epi16(in[14], 6);
237091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_srai_epi16(in[15], 6);
237191037db265ecdd914a26e056cf69207b4f50924ehkuang
237291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[0]);
237391037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[1]);
237491037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[2]);
237591037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[3]);
237691037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[4]);
237791037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[5]);
237891037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[6]);
237991037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[7]);
238091037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[8]);
238191037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[9]);
238291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[10]);
238391037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[11]);
238491037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[12]);
238591037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[13]);
238691037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[14]);
238791037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE(dest, in[15]);
238891037db265ecdd914a26e056cf69207b4f50924ehkuang}
238991037db265ecdd914a26e056cf69207b4f50924ehkuang
// 16x16 inverse hybrid transform with reconstruction.
//
// input:   coefficient block; read as two 8-wide column halves starting at
//          input and input + 8 (see load_buffer_8x16 for the row spacing and
//          alignment requirement).
// dest:    output pixels; the two halves are written starting at dest and
//          dest + 8 through write_buffer_8x16.
// stride:  forwarded unchanged to write_buffer_8x16.
// tx_type: selects the pair of 1-D passes, 0..3 as labelled on the cases
//          below. Any other value trips assert(0); when assertions are
//          compiled out (NDEBUG) no pass is run and the loaded coefficients
//          go straight to write_buffer_8x16.
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in0[16], in1[16];

  load_buffer_8x16(input, in0);
  load_buffer_8x16(input + 8, in1);

  // Each helper performs one 1-D pass over the whole block, so every case
  // runs exactly two of them.
  switch (tx_type) {
    case 0:  // DCT_DCT
      idct16_1d_sse2(in0, in1);
      idct16_1d_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      idct16_1d_sse2(in0, in1);
      iadst16_1d_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      iadst16_1d_sse2(in0, in1);
      idct16_1d_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      iadst16_1d_sse2(in0, in1);
      iadst16_1d_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }

  write_buffer_8x16(dest, in0, stride);
  write_buffer_8x16(dest + 8, in1, stride);
}
242491037db265ecdd914a26e056cf69207b4f50924ehkuang
24255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
24265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               int stride) {
2427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
2429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
2430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
2434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
2435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
2436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
2437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
2443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
2444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
2448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
2449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
2457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
2458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
2459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in14 = zero, in15 = zero;
2460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
2461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
2462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
2463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
2465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8_0, stp1_12_0;
2467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
2469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
2471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 1-D idct. Load input data.
24725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in0 = _mm_load_si128((const __m128i *)input);
24735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
24745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
24755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
24765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
24775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
24785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
24795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
2480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
2482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
2483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage2
2485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
2487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
2488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
2489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);
2490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
2494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
2495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
2496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
2497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding);
2505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding);
2506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_add_epi32(tmp5, rounding);
2507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_add_epi32(tmp7, rounding);
2508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_8 = _mm_packs_epi32(tmp0, zero);
2519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_15 = _mm_packs_epi32(tmp2, zero);
2520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_9 = _mm_packs_epi32(tmp4, zero);
2521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_14 = _mm_packs_epi32(tmp6, zero);
2522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_10 = _mm_packs_epi32(tmp1, zero);
2524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_13 = _mm_packs_epi32(tmp3, zero);
2525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_11 = _mm_packs_epi32(tmp5, zero);
2526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_12 = _mm_packs_epi32(tmp7, zero);
2527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage3
2530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
2532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);
2533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
2537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);
2538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_4 = _mm_packs_epi32(tmp0, zero);
2550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_7 = _mm_packs_epi32(tmp2, zero);
2551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp4, zero);
2552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp6, zero);
2553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
2555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
2556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
2557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
2558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
2560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
2561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
2562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
2563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage4
2566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
2568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
2569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
2570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2572ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
2575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
2576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding);
2586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding);
2587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_add_epi32(tmp5, rounding);
2588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_add_epi32(tmp7, rounding);
2589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_0 = _mm_packs_epi32(tmp0, zero);
2600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_1 = _mm_packs_epi32(tmp2, zero);
2601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_2 = _mm_packs_epi32(tmp4, zero);
2602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_3 = _mm_packs_epi32(tmp6, zero);
2603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_9 = _mm_packs_epi32(tmp1, zero);
2604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_14 = _mm_packs_epi32(tmp3, zero);
2605ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_10 = _mm_packs_epi32(tmp5, zero);
2606ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_13 = _mm_packs_epi32(tmp7, zero);
2607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2608ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
2609ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
2610ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
2611ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
2612ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2613ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2614ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage5 and Stage6
2615ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2616ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
2617ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
2618ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
2619ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
2620ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2621ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
2622ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
2623ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
2624ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
2625ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2626ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
2627ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
2628ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
2629ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
2630ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2631ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2632ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage6
2633ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2634ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
2635ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2636ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2637ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2638ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2639ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2640ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2641ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2642ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2643ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2644ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2645ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding);
2646ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding);
2647ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2648ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2649ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2650ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2651ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2652ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2653ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2654ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2655ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2656ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2657ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2658ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2659ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp1, zero);
2660ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp3, zero);
2661ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_10 = _mm_packs_epi32(tmp0, zero);
2662ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_13 = _mm_packs_epi32(tmp2, zero);
2663ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_11 = _mm_packs_epi32(tmp4, zero);
2664ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_12 = _mm_packs_epi32(tmp6, zero);
2665ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2666ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
2667ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
2668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
2669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
2670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
2671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
2672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
2673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
2674ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage7. Left 8x16 only.
2677ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l0 = _mm_add_epi16(stp2_0, stp1_15);
2678ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l1 = _mm_add_epi16(stp2_1, stp1_14);
2679ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l2 = _mm_add_epi16(stp2_2, stp2_13);
2680ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l3 = _mm_add_epi16(stp2_3, stp2_12);
2681ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l4 = _mm_add_epi16(stp2_4, stp2_11);
2682ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l5 = _mm_add_epi16(stp2_5, stp2_10);
2683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l6 = _mm_add_epi16(stp2_6, stp1_9);
2684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l7 = _mm_add_epi16(stp2_7, stp1_8);
2685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l8 = _mm_sub_epi16(stp2_7, stp1_8);
2686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l9 = _mm_sub_epi16(stp2_6, stp1_9);
2687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l10 = _mm_sub_epi16(stp2_5, stp2_10);
2688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l11 = _mm_sub_epi16(stp2_4, stp2_11);
2689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l12 = _mm_sub_epi16(stp2_3, stp2_12);
2690ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l13 = _mm_sub_epi16(stp2_2, stp2_13);
2691ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l14 = _mm_sub_epi16(stp2_1, stp1_14);
2692ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l15 = _mm_sub_epi16(stp2_0, stp1_15);
2693ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2694ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 2-D idct. We do 2 8x16 blocks.
2695ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 2; i++) {
2696ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 0)
2697ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
2698ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in5, in6, in7);
2699ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2700ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 1)
2701ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
2702ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
2703ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2704ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
2705ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
27065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    IDCT16_1D
2707ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2708ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage7
2709ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in0 = _mm_add_epi16(stp2_0, stp1_15);
2710ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in1 = _mm_add_epi16(stp2_1, stp1_14);
2711ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in2 = _mm_add_epi16(stp2_2, stp2_13);
2712ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in3 = _mm_add_epi16(stp2_3, stp2_12);
2713ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in4 = _mm_add_epi16(stp2_4, stp2_11);
2714ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in5 = _mm_add_epi16(stp2_5, stp2_10);
2715ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in6 = _mm_add_epi16(stp2_6, stp1_9);
2716ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in7 = _mm_add_epi16(stp2_7, stp1_8);
2717ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = _mm_sub_epi16(stp2_7, stp1_8);
2718ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in9 = _mm_sub_epi16(stp2_6, stp1_9);
2719ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in10 = _mm_sub_epi16(stp2_5, stp2_10);
2720ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in11 = _mm_sub_epi16(stp2_4, stp2_11);
2721ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in12 = _mm_sub_epi16(stp2_3, stp2_12);
2722ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in13 = _mm_sub_epi16(stp2_2, stp2_13);
2723ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in14 = _mm_sub_epi16(stp2_1, stp1_14);
2724ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in15 = _mm_sub_epi16(stp2_0, stp1_15);
2725ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2726ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Final rounding and shift
2727ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in0 = _mm_adds_epi16(in0, final_rounding);
2728ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in1 = _mm_adds_epi16(in1, final_rounding);
2729ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in2 = _mm_adds_epi16(in2, final_rounding);
2730ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in3 = _mm_adds_epi16(in3, final_rounding);
2731ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in4 = _mm_adds_epi16(in4, final_rounding);
2732ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in5 = _mm_adds_epi16(in5, final_rounding);
2733ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in6 = _mm_adds_epi16(in6, final_rounding);
2734ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in7 = _mm_adds_epi16(in7, final_rounding);
2735ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = _mm_adds_epi16(in8, final_rounding);
2736ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in9 = _mm_adds_epi16(in9, final_rounding);
2737ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in10 = _mm_adds_epi16(in10, final_rounding);
2738ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in11 = _mm_adds_epi16(in11, final_rounding);
2739ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in12 = _mm_adds_epi16(in12, final_rounding);
2740ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in13 = _mm_adds_epi16(in13, final_rounding);
2741ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in14 = _mm_adds_epi16(in14, final_rounding);
2742ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in15 = _mm_adds_epi16(in15, final_rounding);
2743ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2744ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in0 = _mm_srai_epi16(in0, 6);
2745ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in1 = _mm_srai_epi16(in1, 6);
2746ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in2 = _mm_srai_epi16(in2, 6);
2747ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in3 = _mm_srai_epi16(in3, 6);
2748ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in4 = _mm_srai_epi16(in4, 6);
2749ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in5 = _mm_srai_epi16(in5, 6);
2750ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in6 = _mm_srai_epi16(in6, 6);
2751ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in7 = _mm_srai_epi16(in7, 6);
2752ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = _mm_srai_epi16(in8, 6);
2753ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in9 = _mm_srai_epi16(in9, 6);
2754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in10 = _mm_srai_epi16(in10, 6);
2755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in11 = _mm_srai_epi16(in11, 6);
2756ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in12 = _mm_srai_epi16(in12, 6);
2757ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in13 = _mm_srai_epi16(in13, 6);
2758ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in14 = _mm_srai_epi16(in14, 6);
2759ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in15 = _mm_srai_epi16(in15, 6);
2760ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2761ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in0);
2762ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in1);
2763ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in2);
2764ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in3);
2765ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in4);
2766ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in5);
2767ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in6);
2768ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in7);
2769ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in8);
2770ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in9);
2771ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in10);
2772ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in11);
2773ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in12);
2774ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in13);
2775ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in14);
2776ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in15);
2777ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2778ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest += 8 - (stride * 16);
2779ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2780ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
2781ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Loads one 128-bit vector from `input` into `reg`, then advances `input`
// by 8 elements (exactly one vector when `input` points at 16-bit data).
//   reg   - lvalue of type __m128i that receives the loaded vector.
//   input - modifiable pointer lvalue. It must be 16-byte aligned because
//           _mm_load_si128 is an aligned load. Evaluated twice, so do not
//           pass an expression with side effects.
// The expansion is a plain braced block; do not use it as the unbraced body
// of an `if` that is followed by an `else`.
#define LOAD_DQCOEFF(reg, input) \
  { \
    (reg) = _mm_load_si128((const __m128i *)(input)); \
    (input) += 8; \
  }
2787f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
27885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#define IDCT32_1D \
27895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage1 */ \
27905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
27915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
27925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
27935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
27945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
27955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
27965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
27975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
27985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \
27995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
28005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
28025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
28035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
28045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
28055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
28075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
28085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
28095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
28105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
28125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
28135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_17, stp1_30) \
28145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
28155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
28165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_19, stp1_28) \
28175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
28185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
28195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_21, stp1_26) \
28205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
28215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
28225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_23, stp1_24) \
28235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
28245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
28255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage2 */ \
28265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
28275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
28285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
28295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
28305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
28315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
28335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
28345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
28355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
28365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
28385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
28395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_14) \
28405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
28415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
28425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_11, stp2_12) \
28435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
28455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
28465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
28475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
28485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
28505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
28515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
28525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
28535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
28555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
28565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
28575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
28585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
28605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
28615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
28625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
28635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
28645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
28655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage3 */ \
28665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
28675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
28685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
28695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
28705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
28715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
28735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
28745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
28755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
28765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
28785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
28795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
28805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
28815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
28835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
28845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_6) \
28855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
28875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
28885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
28895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
28905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
28915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
28925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
28935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
28945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
28955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
28965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
28975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_18, stp1_29) \
28985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
28995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
29005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_22, stp1_25) \
29015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_16 = stp2_16; \
29035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_31 = stp2_31; \
29045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_19 = stp2_19; \
29055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_20 = stp2_20; \
29065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_23 = stp2_23; \
29075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_24 = stp2_24; \
29085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_27 = stp2_27; \
29095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_28 = stp2_28; \
29105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
29115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
29125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage4 */ \
29135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
29145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
29155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
29165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
29175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
29185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
29205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
29215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
29225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
29235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
29255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
29265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_2, stp2_3) \
29275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
29295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
29305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
29315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
29325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
29345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
29355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_10, stp2_13) \
29365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_8 = stp1_8; \
29385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_15 = stp1_15; \
29395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_11 = stp1_11; \
29405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_12 = stp1_12; \
29415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
29435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
29445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
29455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
29465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
29475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
29485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
29495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
29505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
29525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
29535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
29545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
29555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
29565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
29575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
29585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
29595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
29605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
29615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage5 */ \
29625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
29635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
29645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
29655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
29665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
29675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
29695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
29705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
29715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
29725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
29745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
29755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
29775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
29785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
29795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
29805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
29825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
29835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
29845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
29855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp0 = _mm_add_epi32(tmp0, rounding); \
29875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp1 = _mm_add_epi32(tmp1, rounding); \
29885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp2 = _mm_add_epi32(tmp2, rounding); \
29895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp3 = _mm_add_epi32(tmp3, rounding); \
29905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
29925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
29935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
29945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
29955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
29975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
29985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
29995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_4 = stp2_4; \
30005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_7 = stp2_7; \
30015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
30035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
30045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
30055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
30065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
30075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
30085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
30095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
30105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_16 = stp2_16; \
30125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_17 = stp2_17; \
30135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
30155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
30165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_19, stp1_28) \
30175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
30185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
30195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_21, stp1_26) \
30205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_22 = stp2_22; \
30225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_23 = stp2_23; \
30235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_24 = stp2_24; \
30245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_25 = stp2_25; \
30255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_30 = stp2_30; \
30265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_31 = stp2_31; \
30275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
30285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
30295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage6 */ \
30305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
30315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
30325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
30335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
30345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
30355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
30375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
30385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
30395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
30405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
30415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
30425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
30435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
30445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_8 = stp1_8; \
30465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_9 = stp1_9; \
30475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_14 = stp1_14; \
30485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_15 = stp1_15; \
30495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
30515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
30525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp2_13, stp2_11, stp2_12) \
30535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
30555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
30565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
30575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
30585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
30595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
30605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
30615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
30625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
30645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
30655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
30665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
30675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
30685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
30695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
30705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
30715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \
30725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\
30735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage7 */ \
30745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \
30755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
30765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
30775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
30785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
30795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
30815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
30825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
30835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
30845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
30855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
30865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
30875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
30885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
30895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
30905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
30915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
30925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
30935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
30945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
30955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
30965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
30975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
30985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
30995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
31005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
31015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_16 = stp2_16; \
31035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_17 = stp2_17; \
31045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_18 = stp2_18; \
31055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_19 = stp2_19; \
31065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
31085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
31095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_21, stp1_26) \
31105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
31115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
31125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         stp1_23, stp1_24) \
31135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  \
31145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_28 = stp2_28; \
31155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_29 = stp2_29; \
31165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_30 = stp2_30; \
31175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  stp1_31 = stp2_31; \
31185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
31195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang// Only the upper-left 8x8 block has non-zero coefficients.
31215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
31225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                 int stride) {
31235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
31245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
31255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // idct constants for each stage
31275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
31285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
31295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
31305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
31315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
31325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
31335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
31345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
31355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
31365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
31375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
31385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
31395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
31405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
31415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
31425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
31435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
31455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
31465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
31475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
31485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
31495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
31505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
31515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
31525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
31545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
31555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
31565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
31575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
31585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
31595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
31605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
31615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
31625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
31635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
31655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
31665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
31675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
31685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
31695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
31705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
31715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
31735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
31755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
31765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          in24, in25, in26, in27, in28, in29, in30, in31;
31775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i col[128];
31785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
31795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
31805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
31815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
31825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp1_30, stp1_31;
31835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
31845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
31855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
31865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
31875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          stp2_30, stp2_31;
31885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
31895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int i, j, i32;
31905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
31915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct.
31925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (i = 0; i < 8; i++) {
31935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    i32 = (i << 5);
31945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    if (i == 0) {
31955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // First 1-D idct: first 8 rows
31965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Load input data.
31975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in0, input);
31985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in8, input);
31995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in16, input);
32005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in24, input);
32015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in1, input);
32025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in9, input);
32035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in17, input);
32045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in25, input);
32055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in2, input);
32065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in10, input);
32075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in18, input);
32085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in26, input);
32095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in3, input);
32105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in11, input);
32115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in19, input);
32125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in27, input);
32135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
32145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in4, input);
32155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in12, input);
32165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in20, input);
32175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in28, input);
32185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in5, input);
32195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in13, input);
32205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in21, input);
32215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in29, input);
32225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in6, input);
32235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in14, input);
32245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in22, input);
32255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in30, input);
32265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in7, input);
32275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in15, input);
32285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in23, input);
32295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      LOAD_DQCOEFF(in31, input);
32305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
32315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Transpose 32x8 block to 8x32 block
32325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
32335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in4, in5, in6, in7);
32345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
32355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in10, in11, in12, in13, in14, in15);
32365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
32375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in18, in19, in20, in21, in22, in23);
32385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
32395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in26, in27, in28, in29, in30, in31);
32405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    } else if (i < 4) {
32415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // First 1-D idct: next 24 zero-coeff rows
32425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 0] = _mm_setzero_si128();
32435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 1] = _mm_setzero_si128();
32445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 2] = _mm_setzero_si128();
32455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 3] = _mm_setzero_si128();
32465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 4] = _mm_setzero_si128();
32475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 5] = _mm_setzero_si128();
32485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 6] = _mm_setzero_si128();
32495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 7] = _mm_setzero_si128();
32505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 8] = _mm_setzero_si128();
32515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 9] = _mm_setzero_si128();
32525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 10] = _mm_setzero_si128();
32535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 11] = _mm_setzero_si128();
32545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 12] = _mm_setzero_si128();
32555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 13] = _mm_setzero_si128();
32565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 14] = _mm_setzero_si128();
32575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 15] = _mm_setzero_si128();
32585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 16] = _mm_setzero_si128();
32595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 17] = _mm_setzero_si128();
32605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 18] = _mm_setzero_si128();
32615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 19] = _mm_setzero_si128();
32625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 20] = _mm_setzero_si128();
32635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 21] = _mm_setzero_si128();
32645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 22] = _mm_setzero_si128();
32655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 23] = _mm_setzero_si128();
32665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 24] = _mm_setzero_si128();
32675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 25] = _mm_setzero_si128();
32685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 26] = _mm_setzero_si128();
32695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 27] = _mm_setzero_si128();
32705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 28] = _mm_setzero_si128();
32715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 29] = _mm_setzero_si128();
32725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 30] = _mm_setzero_si128();
32735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 31] = _mm_setzero_si128();
32745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      continue;
32755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    } else {
32765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Second 1-D idct
32775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      j = i - 4;
32785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
32795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Transpose 32x8 block to 8x32 block
32805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
32815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
32825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
32835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in5, in6, in7);
32845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      j += 4;
32855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
32865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
32875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
32885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in11, in12, in13, in14, in15);
32895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      j += 4;
32905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
32915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
32925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
32935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in19, in20, in21, in22, in23);
32945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      j += 4;
32955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
32965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
32975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
32985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    in28, in29, in30, in31);
32995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
33005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
33015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    IDCT32_1D
33025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
33035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    // final stage
33045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    if (i < 4) {
33055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // 1-D: Store 32 intermediate results for each 8x32 block.
33065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
33075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
33085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
33095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
33105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
33115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
33125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
33135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
33145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
33155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
33165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
33175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
33185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
33195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
33205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
33215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
33225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
33235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
33245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
33255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
33265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
33275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
33285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
33295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
33305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
33315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
33325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
33335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
33345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
33355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
33365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
33375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
33385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    } else {
33395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      const __m128i zero = _mm_setzero_si128();
33405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
33415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // 2-D: Calculate the results and store them to destination.
33425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in0 = _mm_add_epi16(stp1_0, stp1_31);
33435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in1 = _mm_add_epi16(stp1_1, stp1_30);
33445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in2 = _mm_add_epi16(stp1_2, stp1_29);
33455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in3 = _mm_add_epi16(stp1_3, stp1_28);
33465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in4 = _mm_add_epi16(stp1_4, stp1_27);
33475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in5 = _mm_add_epi16(stp1_5, stp1_26);
33485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in6 = _mm_add_epi16(stp1_6, stp1_25);
33495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in7 = _mm_add_epi16(stp1_7, stp1_24);
33505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in8 = _mm_add_epi16(stp1_8, stp1_23);
33515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in9 = _mm_add_epi16(stp1_9, stp1_22);
33525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in10 = _mm_add_epi16(stp1_10, stp1_21);
33535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in11 = _mm_add_epi16(stp1_11, stp1_20);
33545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in12 = _mm_add_epi16(stp1_12, stp1_19);
33555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in13 = _mm_add_epi16(stp1_13, stp1_18);
33565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in14 = _mm_add_epi16(stp1_14, stp1_17);
33575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in15 = _mm_add_epi16(stp1_15, stp1_16);
33585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in16 = _mm_sub_epi16(stp1_15, stp1_16);
33595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in17 = _mm_sub_epi16(stp1_14, stp1_17);
33605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in18 = _mm_sub_epi16(stp1_13, stp1_18);
33615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in19 = _mm_sub_epi16(stp1_12, stp1_19);
33625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in20 = _mm_sub_epi16(stp1_11, stp1_20);
33635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in21 = _mm_sub_epi16(stp1_10, stp1_21);
33645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in22 = _mm_sub_epi16(stp1_9, stp1_22);
33655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in23 = _mm_sub_epi16(stp1_8, stp1_23);
33665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in24 = _mm_sub_epi16(stp1_7, stp1_24);
33675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in25 = _mm_sub_epi16(stp1_6, stp1_25);
33685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in26 = _mm_sub_epi16(stp1_5, stp1_26);
33695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in27 = _mm_sub_epi16(stp1_4, stp1_27);
33705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in28 = _mm_sub_epi16(stp1_3, stp1_28);
33715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in29 = _mm_sub_epi16(stp1_2, stp1_29);
33725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in30 = _mm_sub_epi16(stp1_1, stp1_30);
33735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in31 = _mm_sub_epi16(stp1_0, stp1_31);
33745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
33755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Final rounding and shift
33765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in0 = _mm_adds_epi16(in0, final_rounding);
33775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in1 = _mm_adds_epi16(in1, final_rounding);
33785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in2 = _mm_adds_epi16(in2, final_rounding);
33795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in3 = _mm_adds_epi16(in3, final_rounding);
33805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in4 = _mm_adds_epi16(in4, final_rounding);
33815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in5 = _mm_adds_epi16(in5, final_rounding);
33825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in6 = _mm_adds_epi16(in6, final_rounding);
33835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in7 = _mm_adds_epi16(in7, final_rounding);
33845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in8 = _mm_adds_epi16(in8, final_rounding);
33855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in9 = _mm_adds_epi16(in9, final_rounding);
33865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in10 = _mm_adds_epi16(in10, final_rounding);
33875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in11 = _mm_adds_epi16(in11, final_rounding);
33885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in12 = _mm_adds_epi16(in12, final_rounding);
33895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in13 = _mm_adds_epi16(in13, final_rounding);
33905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in14 = _mm_adds_epi16(in14, final_rounding);
33915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in15 = _mm_adds_epi16(in15, final_rounding);
33925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in16 = _mm_adds_epi16(in16, final_rounding);
33935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in17 = _mm_adds_epi16(in17, final_rounding);
33945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in18 = _mm_adds_epi16(in18, final_rounding);
33955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in19 = _mm_adds_epi16(in19, final_rounding);
33965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in20 = _mm_adds_epi16(in20, final_rounding);
33975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in21 = _mm_adds_epi16(in21, final_rounding);
33985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in22 = _mm_adds_epi16(in22, final_rounding);
33995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in23 = _mm_adds_epi16(in23, final_rounding);
34005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in24 = _mm_adds_epi16(in24, final_rounding);
34015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in25 = _mm_adds_epi16(in25, final_rounding);
34025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in26 = _mm_adds_epi16(in26, final_rounding);
34035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in27 = _mm_adds_epi16(in27, final_rounding);
34045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in28 = _mm_adds_epi16(in28, final_rounding);
34055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in29 = _mm_adds_epi16(in29, final_rounding);
34065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in30 = _mm_adds_epi16(in30, final_rounding);
34075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in31 = _mm_adds_epi16(in31, final_rounding);
34085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in0 = _mm_srai_epi16(in0, 6);
34105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in1 = _mm_srai_epi16(in1, 6);
34115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in2 = _mm_srai_epi16(in2, 6);
34125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in3 = _mm_srai_epi16(in3, 6);
34135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in4 = _mm_srai_epi16(in4, 6);
34145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in5 = _mm_srai_epi16(in5, 6);
34155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in6 = _mm_srai_epi16(in6, 6);
34165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in7 = _mm_srai_epi16(in7, 6);
34175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in8 = _mm_srai_epi16(in8, 6);
34185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in9 = _mm_srai_epi16(in9, 6);
34195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in10 = _mm_srai_epi16(in10, 6);
34205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in11 = _mm_srai_epi16(in11, 6);
34215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in12 = _mm_srai_epi16(in12, 6);
34225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in13 = _mm_srai_epi16(in13, 6);
34235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in14 = _mm_srai_epi16(in14, 6);
34245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in15 = _mm_srai_epi16(in15, 6);
34255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in16 = _mm_srai_epi16(in16, 6);
34265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in17 = _mm_srai_epi16(in17, 6);
34275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in18 = _mm_srai_epi16(in18, 6);
34285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in19 = _mm_srai_epi16(in19, 6);
34295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in20 = _mm_srai_epi16(in20, 6);
34305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in21 = _mm_srai_epi16(in21, 6);
34315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in22 = _mm_srai_epi16(in22, 6);
34325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in23 = _mm_srai_epi16(in23, 6);
34335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in24 = _mm_srai_epi16(in24, 6);
34345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in25 = _mm_srai_epi16(in25, 6);
34355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in26 = _mm_srai_epi16(in26, 6);
34365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in27 = _mm_srai_epi16(in27, 6);
34375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in28 = _mm_srai_epi16(in28, 6);
34385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in29 = _mm_srai_epi16(in29, 6);
34395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in30 = _mm_srai_epi16(in30, 6);
34405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      in31 = _mm_srai_epi16(in31, 6);
34415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in0);
34435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in1);
34445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in2);
34455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in3);
34465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in4);
34475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in5);
34485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in6);
34495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in7);
34505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in8);
34515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in9);
34525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in10);
34535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in11);
34545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in12);
34555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in13);
34565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in14);
34575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in15);
34585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in16);
34595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in17);
34605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in18);
34615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in19);
34625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in20);
34635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in21);
34645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in22);
34655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in23);
34665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in24);
34675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in25);
34685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in26);
34695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in27);
34705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in28);
34715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in29);
34725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in30);
34735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      RECON_AND_STORE(dest, in31);
34745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dest += 8 - (stride * 32);
34765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
34775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
34785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
34795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
34805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
34815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                 int stride) {
3482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
3484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // idct constants for each stage
3486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
3534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
3535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in24, in25, in26, in27, in28, in29, in30, in31;
3536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i col[128];
3537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_30, stp1_31;
3542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_30, stp2_31;
3547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3548f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int i, j, i32;
3549f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  __m128i zero_idx[16];
3550f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int zero_flag[2];
3551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // We work on an 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
3553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 8; i++) {
3554f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    i32 = (i << 5);
3555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i < 4) {
3556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // First 1-D idct
3557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Load input data.
3558f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in0, input);
3559f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in8, input);
3560f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in16, input);
3561f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in24, input);
3562f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in1, input);
3563f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in9, input);
3564f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in17, input);
3565f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in25, input);
3566f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in2, input);
3567f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in10, input);
3568f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in18, input);
3569f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in26, input);
3570f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in3, input);
3571f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in11, input);
3572f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in19, input);
3573f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in27, input);
3574f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3575f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in4, input);
3576f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in12, input);
3577f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in20, input);
3578f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in28, input);
3579f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in5, input);
3580f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in13, input);
3581f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in21, input);
3582f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in29, input);
3583f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in6, input);
3584f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in14, input);
3585f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in22, input);
3586f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in30, input);
3587f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in7, input);
3588f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in15, input);
3589f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in23, input);
3590f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      LOAD_DQCOEFF(in31, input);
3591f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3592f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      // checking if all entries are zero
3593f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[0] = _mm_or_si128(in0, in1);
3594f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[1] = _mm_or_si128(in2, in3);
3595f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[2] = _mm_or_si128(in4, in5);
3596f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[3] = _mm_or_si128(in6, in7);
3597f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[4] = _mm_or_si128(in8, in9);
3598f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[5] = _mm_or_si128(in10, in11);
3599f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[6] = _mm_or_si128(in12, in13);
3600f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[7] = _mm_or_si128(in14, in15);
3601f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[8] = _mm_or_si128(in16, in17);
3602f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[9] = _mm_or_si128(in18, in19);
3603f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[10] = _mm_or_si128(in20, in21);
3604f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[11] = _mm_or_si128(in22, in23);
3605f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[12] = _mm_or_si128(in24, in25);
3606f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[13] = _mm_or_si128(in26, in27);
3607f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[14] = _mm_or_si128(in28, in29);
3608f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[15] = _mm_or_si128(in30, in31);
3609f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3610f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3611f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3612f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3613f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3614f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3615f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3616f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3617f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3618f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3619f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3620f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3621f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3622f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3623f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3624f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3625f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3626f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3627f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
3628f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
3629f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
3630f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
3631f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
3632f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
3633f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      if (!zero_flag[0] && !zero_flag[1]) {
3634f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 0] = _mm_setzero_si128();
3635f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 1] = _mm_setzero_si128();
3636f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 2] = _mm_setzero_si128();
3637f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 3] = _mm_setzero_si128();
3638f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 4] = _mm_setzero_si128();
3639f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 5] = _mm_setzero_si128();
3640f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 6] = _mm_setzero_si128();
3641f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 7] = _mm_setzero_si128();
3642f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 8] = _mm_setzero_si128();
3643f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 9] = _mm_setzero_si128();
3644f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 10] = _mm_setzero_si128();
3645f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 11] = _mm_setzero_si128();
3646f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 12] = _mm_setzero_si128();
3647f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 13] = _mm_setzero_si128();
3648f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 14] = _mm_setzero_si128();
3649f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 15] = _mm_setzero_si128();
3650f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 16] = _mm_setzero_si128();
3651f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 17] = _mm_setzero_si128();
3652f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 18] = _mm_setzero_si128();
3653f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 19] = _mm_setzero_si128();
3654f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 20] = _mm_setzero_si128();
3655f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 21] = _mm_setzero_si128();
3656f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 22] = _mm_setzero_si128();
3657f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 23] = _mm_setzero_si128();
3658f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 24] = _mm_setzero_si128();
3659f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 25] = _mm_setzero_si128();
3660f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 26] = _mm_setzero_si128();
3661f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 27] = _mm_setzero_si128();
3662f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 28] = _mm_setzero_si128();
3663f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 29] = _mm_setzero_si128();
3664f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 30] = _mm_setzero_si128();
3665f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        col[i32 + 31] = _mm_setzero_si128();
3666f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang        continue;
3667f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      }
3668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Transpose 32x8 block to 8x32 block
3670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
3671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
3672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
3673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in10, in11, in12, in13, in14, in15);
3674ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
3675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in18, in19, in20, in21, in22, in23);
3676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
3677ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in26, in27, in28, in29, in30, in31);
3678ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else {
3679ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Second 1-D idct
3680ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j = i - 4;
3681ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3682ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Transpose 32x8 block to 8x32 block
3683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
3686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in5, in6, in7);
3687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j += 4;
3688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3690ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
3691ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in11, in12, in13, in14, in15);
3692ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j += 4;
3693ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3694ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3695ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
3696ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in19, in20, in21, in22, in23);
3697ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j += 4;
3698ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3699ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3700ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
3701ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in28, in29, in30, in31);
3702ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
3703ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
37045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    IDCT32_1D
3705ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3706ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // final stage
3707ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i < 4) {
3708ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 1_D: Store 32 intermediate results for each 8x32 block.
3709f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3710f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3711f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3712f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3713f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3714f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3715f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3716f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3717f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3718f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3719f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3720f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3721f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3722f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3723f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3724f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3725f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3726f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3727f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3728f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3729f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3730f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3731f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3732f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3733f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3734f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3735f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3736f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3737f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3738f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3739f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3740f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3741ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else {
3742ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i zero = _mm_setzero_si128();
3743ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3744ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 2_D: Calculate the results and store them to destination.
3745ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_add_epi16(stp1_0, stp1_31);
3746ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_add_epi16(stp1_1, stp1_30);
3747ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_add_epi16(stp1_2, stp1_29);
3748ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_add_epi16(stp1_3, stp1_28);
3749ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_add_epi16(stp1_4, stp1_27);
3750ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_add_epi16(stp1_5, stp1_26);
3751ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_add_epi16(stp1_6, stp1_25);
3752ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_add_epi16(stp1_7, stp1_24);
3753ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_add_epi16(stp1_8, stp1_23);
3754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_add_epi16(stp1_9, stp1_22);
3755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_add_epi16(stp1_10, stp1_21);
3756ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_add_epi16(stp1_11, stp1_20);
3757ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_add_epi16(stp1_12, stp1_19);
3758ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_add_epi16(stp1_13, stp1_18);
3759ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_add_epi16(stp1_14, stp1_17);
3760ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_add_epi16(stp1_15, stp1_16);
3761ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in16 = _mm_sub_epi16(stp1_15, stp1_16);
3762ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in17 = _mm_sub_epi16(stp1_14, stp1_17);
3763ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in18 = _mm_sub_epi16(stp1_13, stp1_18);
3764ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in19 = _mm_sub_epi16(stp1_12, stp1_19);
3765ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in20 = _mm_sub_epi16(stp1_11, stp1_20);
3766ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in21 = _mm_sub_epi16(stp1_10, stp1_21);
3767ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in22 = _mm_sub_epi16(stp1_9, stp1_22);
3768ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in23 = _mm_sub_epi16(stp1_8, stp1_23);
3769ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in24 = _mm_sub_epi16(stp1_7, stp1_24);
3770ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in25 = _mm_sub_epi16(stp1_6, stp1_25);
3771ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in26 = _mm_sub_epi16(stp1_5, stp1_26);
3772ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in27 = _mm_sub_epi16(stp1_4, stp1_27);
3773ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in28 = _mm_sub_epi16(stp1_3, stp1_28);
3774ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in29 = _mm_sub_epi16(stp1_2, stp1_29);
3775ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in30 = _mm_sub_epi16(stp1_1, stp1_30);
3776ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in31 = _mm_sub_epi16(stp1_0, stp1_31);
3777ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3778ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Final rounding and shift
3779ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_adds_epi16(in0, final_rounding);
3780ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_adds_epi16(in1, final_rounding);
3781ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_adds_epi16(in2, final_rounding);
3782ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_adds_epi16(in3, final_rounding);
3783ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_adds_epi16(in4, final_rounding);
3784ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_adds_epi16(in5, final_rounding);
3785ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_adds_epi16(in6, final_rounding);
3786ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_adds_epi16(in7, final_rounding);
3787ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_adds_epi16(in8, final_rounding);
3788ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_adds_epi16(in9, final_rounding);
3789ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_adds_epi16(in10, final_rounding);
3790ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_adds_epi16(in11, final_rounding);
3791ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_adds_epi16(in12, final_rounding);
3792ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_adds_epi16(in13, final_rounding);
3793ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_adds_epi16(in14, final_rounding);
3794ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_adds_epi16(in15, final_rounding);
3795ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in16 = _mm_adds_epi16(in16, final_rounding);
3796ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in17 = _mm_adds_epi16(in17, final_rounding);
3797ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in18 = _mm_adds_epi16(in18, final_rounding);
3798ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in19 = _mm_adds_epi16(in19, final_rounding);
3799ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in20 = _mm_adds_epi16(in20, final_rounding);
3800ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in21 = _mm_adds_epi16(in21, final_rounding);
3801ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in22 = _mm_adds_epi16(in22, final_rounding);
3802ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in23 = _mm_adds_epi16(in23, final_rounding);
3803ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in24 = _mm_adds_epi16(in24, final_rounding);
3804ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in25 = _mm_adds_epi16(in25, final_rounding);
3805ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in26 = _mm_adds_epi16(in26, final_rounding);
3806ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in27 = _mm_adds_epi16(in27, final_rounding);
3807ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in28 = _mm_adds_epi16(in28, final_rounding);
3808ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in29 = _mm_adds_epi16(in29, final_rounding);
3809ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in30 = _mm_adds_epi16(in30, final_rounding);
3810ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in31 = _mm_adds_epi16(in31, final_rounding);
3811ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3812ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_srai_epi16(in0, 6);
3813ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_srai_epi16(in1, 6);
3814ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_srai_epi16(in2, 6);
3815ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_srai_epi16(in3, 6);
3816ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_srai_epi16(in4, 6);
3817ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_srai_epi16(in5, 6);
3818ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_srai_epi16(in6, 6);
3819ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_srai_epi16(in7, 6);
3820ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_srai_epi16(in8, 6);
3821ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_srai_epi16(in9, 6);
3822ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_srai_epi16(in10, 6);
3823ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_srai_epi16(in11, 6);
3824ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_srai_epi16(in12, 6);
3825ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_srai_epi16(in13, 6);
3826ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_srai_epi16(in14, 6);
3827ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_srai_epi16(in15, 6);
3828ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in16 = _mm_srai_epi16(in16, 6);
3829ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in17 = _mm_srai_epi16(in17, 6);
3830ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in18 = _mm_srai_epi16(in18, 6);
3831ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in19 = _mm_srai_epi16(in19, 6);
3832ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in20 = _mm_srai_epi16(in20, 6);
3833ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in21 = _mm_srai_epi16(in21, 6);
3834ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in22 = _mm_srai_epi16(in22, 6);
3835ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in23 = _mm_srai_epi16(in23, 6);
3836ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in24 = _mm_srai_epi16(in24, 6);
3837ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in25 = _mm_srai_epi16(in25, 6);
3838ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in26 = _mm_srai_epi16(in26, 6);
3839ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in27 = _mm_srai_epi16(in27, 6);
3840ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in28 = _mm_srai_epi16(in28, 6);
3841ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in29 = _mm_srai_epi16(in29, 6);
3842ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in30 = _mm_srai_epi16(in30, 6);
3843ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in31 = _mm_srai_epi16(in31, 6);
3844ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3845ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in0);
3846ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in1);
3847ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in2);
3848ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in3);
3849ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in4);
3850ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in5);
3851ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in6);
3852ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in7);
3853ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in8);
3854ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in9);
3855ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in10);
3856ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in11);
3857ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in12);
3858ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in13);
3859ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in14);
3860ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in15);
3861ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in16);
3862ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in17);
3863ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in18);
3864ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in19);
3865ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in20);
3866ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in21);
3867ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in22);
3868ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in23);
3869ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in24);
3870ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in25);
3871ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in26);
3872ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in27);
3873ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in28);
3874ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in29);
3875ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in30);
3876ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in31);
3877ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3878ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      dest += 8 - (stride * 32);
3879ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
3880ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
38815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}  //NOLINT
38825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
38835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
38845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __m128i dc_value;
38855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const __m128i zero = _mm_setzero_si128();
38865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int a, i;
38875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
38885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
38895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  a = dct_const_round_shift(a * cospi_16_64);
38905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  a = ROUND_POWER_OF_TWO(a, 6);
38915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
38925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  dc_value = _mm_set1_epi16(a);
38935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
38945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (i = 0; i < 4; ++i) {
38955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
38965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
38975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
38985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
38995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    RECON_AND_STORE(dest, dc_value);
39275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dest += 8 - (stride * 32);
39285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
3929ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
3930