// vp9_idct_intrin_sse2.c revision 91037db265ecdd914a26e056cf69207b4f50924e
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
18ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
19ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
20ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i eight = _mm_set1_epi16(8);
21ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
22ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
23ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
24ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
25ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
26ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i input0, input1, input2, input3;
27ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
28ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Rows
29ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_loadl_epi64((__m128i *)input);
30ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_loadl_epi64((__m128i *)(input + 4));
31ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_loadl_epi64((__m128i *)(input + 8));
32ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_loadl_epi64((__m128i *)(input + 12));
33ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
34ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Construct i3, i1, i3, i1, i2, i0, i2, i0
35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_shufflelo_epi16(input0, 0xd8);
36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shufflelo_epi16(input1, 0xd8);
37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_shufflelo_epi16(input2, 0xd8);
38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_shufflelo_epi16(input3, 0xd8);
39ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
40ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_unpacklo_epi32(input0, input0);
41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpacklo_epi32(input1, input1);
42ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_unpacklo_epi32(input2, input2);
43ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_unpacklo_epi32(input3, input3);
44ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
45ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
46ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_madd_epi16(input0, cst);
47ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_madd_epi16(input1, cst);
48ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_madd_epi16(input2, cst);
49ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_madd_epi16(input3, cst);
50ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
51ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_add_epi32(input0, rounding);
52ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_add_epi32(input1, rounding);
53ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi32(input2, rounding);
54ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi32(input3, rounding);
55ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
56ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
57ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
58ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
59ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
60ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
61ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
62ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_packs_epi32(input0, zero);
63ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_packs_epi32(input1, zero);
64ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_packs_epi32(input2, zero);
65ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_packs_epi32(input3, zero);
66ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
67ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Transpose
68ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpacklo_epi16(input0, input1);
69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_unpacklo_epi16(input2, input3);
70ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_unpacklo_epi32(input1, input3);
71ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpackhi_epi32(input1, input3);
72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
73ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Switch column2, column 3, and then, we got:
74ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // input2: column1, column 0;  input3: column2, column 3.
75ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shuffle_epi32(input1, 0x4e);
76ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input0, input1);
77ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_sub_epi16(input0, input1);
78ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
79ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Columns
80ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Construct i3, i1, i3, i1, i2, i0, i2, i0
81ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_shufflelo_epi16(input2, 0xd8);
82ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shufflehi_epi16(input2, 0xd8);
83ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_shufflehi_epi16(input3, 0xd8);
84ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_shufflelo_epi16(input3, 0xd8);
85ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
86ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_unpacklo_epi32(input0, input0);
87ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpackhi_epi32(input1, input1);
88ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_unpackhi_epi32(input2, input2);
89ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_unpacklo_epi32(input3, input3);
90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
91ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
92ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_madd_epi16(input0, cst);
93ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_madd_epi16(input1, cst);
94ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_madd_epi16(input2, cst);
95ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_madd_epi16(input3, cst);
96ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
97ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_add_epi32(input0, rounding);
98ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_add_epi32(input1, rounding);
99ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi32(input2, rounding);
100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi32(input3, rounding);
101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_packs_epi32(input0, zero);
109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_packs_epi32(input1, zero);
110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_packs_epi32(input2, zero);
111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_packs_epi32(input3, zero);
112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Transpose
114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpacklo_epi16(input0, input1);
115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_unpacklo_epi16(input2, input3);
116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_unpacklo_epi32(input1, input3);
117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_unpackhi_epi32(input1, input3);
118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Switch column2, column 3, and then, we got:
120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // input2: column1, column 0;  input3: column2, column 3.
121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_shuffle_epi32(input1, 0x4e);
122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input0, input1);
123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_sub_epi16(input0, input1);
124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final round and shift
126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_add_epi16(input2, eight);
127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_add_epi16(input3, eight);
128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input2 = _mm_srai_epi16(input2, 4);
130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input3 = _mm_srai_epi16(input3, 4);
131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define RECON_AND_STORE4X4(dest, in_x) \
133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {                                                     \
134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      d0 = _mm_unpacklo_epi8(d0, zero); \
136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      d0 = _mm_add_epi16(in_x, d0); \
137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      d0 = _mm_packus_epi16(d0, d0); \
138ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      *(int *)dest = _mm_cvtsi128_si32(d0); \
139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      dest += stride; \
140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input0 = _mm_srli_si128(input2, 8);
143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input1 = _mm_srli_si128(input3, 8);
144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE4X4(dest, input2);
146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE4X4(dest, input0);
147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE4X4(dest, input1);
148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE4X4(dest, input3);
149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
15191037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
15291037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i dc_value;
15391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i zero = _mm_setzero_si128();
15491037db265ecdd914a26e056cf69207b4f50924ehkuang  int a;
15591037db265ecdd914a26e056cf69207b4f50924ehkuang
15691037db265ecdd914a26e056cf69207b4f50924ehkuang  a = dct_const_round_shift(input[0] * cospi_16_64);
15791037db265ecdd914a26e056cf69207b4f50924ehkuang  a = dct_const_round_shift(a * cospi_16_64);
15891037db265ecdd914a26e056cf69207b4f50924ehkuang  a = ROUND_POWER_OF_TWO(a, 4);
15991037db265ecdd914a26e056cf69207b4f50924ehkuang
16091037db265ecdd914a26e056cf69207b4f50924ehkuang  dc_value = _mm_set1_epi16(a);
16191037db265ecdd914a26e056cf69207b4f50924ehkuang
16291037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
16391037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
16491037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
16591037db265ecdd914a26e056cf69207b4f50924ehkuang  RECON_AND_STORE4X4(dest, dc_value);
16691037db265ecdd914a26e056cf69207b4f50924ehkuang}
16791037db265ecdd914a26e056cf69207b4f50924ehkuang
168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in, temp;
178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Load input data.
180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in = _mm_loadl_epi64((__m128i *)input);
181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Construct i3, i1, i3, i1, i2, i0, i2, i0
183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in = _mm_shufflelo_epi16(in, 0xd8);
184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in = _mm_unpacklo_epi32(in, in);
185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in = _mm_madd_epi16(in, c1);
188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in = _mm_add_epi32(in, rounding);
189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in = _mm_srai_epi32(in, DCT_CONST_BITS);
190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in = _mm_packs_epi32(in, zero);
191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp = _mm_shufflelo_epi16(in, 0x9c);
194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in = _mm_shufflelo_epi16(in, 0xc9);
195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in = _mm_unpacklo_epi64(temp, in);
196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in = _mm_madd_epi16(in, c2);
197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in = _mm_packs_epi32(in, zero);
198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Store results
200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  _mm_storel_epi64((__m128i *)output, in);
201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// In-place transpose of a 4x4 block of 16-bit values.
//
// On entry the low four 16-bit lanes of res[0..3] hold rows 0..3.  On return
// the low four lanes of res[k] hold column k of the input.  The upper four
// lanes are by-products of the shuffles (res[0] and res[2] carry the next
// column there, res[1] and res[3] a copy of their own low half) and should
// not be relied on as zero.
static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
  // res[0] = (column 0 | column 1), res[2] = (column 2 | column 3).
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // Move the odd columns down into the low 64 bits of their own register.
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
21291037db265ecdd914a26e056cf69207b4f50924ehkuang
21391037db265ecdd914a26e056cf69207b4f50924ehkuangvoid idct4_1d_sse2(__m128i *in) {
21491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
21591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
21691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
21791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
21891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
21991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u[8], v[8];
22091037db265ecdd914a26e056cf69207b4f50924ehkuang
22191037db265ecdd914a26e056cf69207b4f50924ehkuang  transpose_4x4(in);
22291037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
22391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
22491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
22591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
22691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
22791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
22891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
22991037db265ecdd914a26e056cf69207b4f50924ehkuang
23091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
23191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
23291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
23391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
23491037db265ecdd914a26e056cf69207b4f50924ehkuang
23591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
23691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
23791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
23891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
23991037db265ecdd914a26e056cf69207b4f50924ehkuang
24091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_packs_epi32(v[0], v[2]);
24191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_packs_epi32(v[1], v[3]);
24291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpackhi_epi64(u[0], u[0]);
24391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi64(u[1], u[1]);
24491037db265ecdd914a26e056cf69207b4f50924ehkuang
24591037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
24691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_add_epi16(u[0], u[3]);
24791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_add_epi16(u[1], u[2]);
24891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_sub_epi16(u[1], u[2]);
24991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_sub_epi16(u[0], u[3]);
25091037db265ecdd914a26e056cf69207b4f50924ehkuang}
25191037db265ecdd914a26e056cf69207b4f50924ehkuang
25291037db265ecdd914a26e056cf69207b4f50924ehkuangvoid iadst4_1d_sse2(__m128i *in) {
25391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
25491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
25591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
25691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
25791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
25891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i kZero = _mm_set1_epi16(0);
25991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
26091037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u[8], v[8], in7;
26191037db265ecdd914a26e056cf69207b4f50924ehkuang
26291037db265ecdd914a26e056cf69207b4f50924ehkuang  transpose_4x4(in);
26391037db265ecdd914a26e056cf69207b4f50924ehkuang  in7 = _mm_add_epi16(in[0], in[3]);
26491037db265ecdd914a26e056cf69207b4f50924ehkuang  in7 = _mm_sub_epi16(in7, in[2]);
26591037db265ecdd914a26e056cf69207b4f50924ehkuang
26691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
26791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
26891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(in7, kZero);
26991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpacklo_epi16(in[1], kZero);
27091037db265ecdd914a26e056cf69207b4f50924ehkuang
27191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
27291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
27391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
27491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
27591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
27691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
27791037db265ecdd914a26e056cf69207b4f50924ehkuang
27891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[1]);
27991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[3], v[4]);
28091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = v[2];
28191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(u[0], u[1]);
28291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_slli_epi32(v[5], 2);
28391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(u[3], v[5]);
28491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_sub_epi32(u[5], u[4]);
28591037db265ecdd914a26e056cf69207b4f50924ehkuang
28691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
28791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
28891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
28991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
29091037db265ecdd914a26e056cf69207b4f50924ehkuang
29191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
29291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
29391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
29491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
29591037db265ecdd914a26e056cf69207b4f50924ehkuang
29691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_packs_epi32(u[0], u[2]);
29791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_packs_epi32(u[1], u[3]);
29891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_unpackhi_epi64(in[0], in[0]);
29991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_unpackhi_epi64(in[1], in[1]);
30091037db265ecdd914a26e056cf69207b4f50924ehkuang}
30191037db265ecdd914a26e056cf69207b4f50924ehkuang
// 4x4 inverse hybrid transform followed by reconstruction.
//
// Loads the 16 coefficients at |input| (four rows of four int16_t), runs two
// 1-D passes chosen by |tx_type| (each pass transposes its input first), then
// rounds, shifts right by 4 bits and adds the residual to the four 4-pixel
// rows starting at |dest|, |stride| bytes apart, with unsigned 8-bit
// saturation.
//
// tx_type must be 0..3; any other value trips assert(0).  When assertions are
// compiled out, an unknown tx_type skips both passes and the untransformed
// coefficients go straight to the round/shift/store stage.
void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in[4];
  // `zero` is consumed by the RECON_AND_STORE4X4 expansions below.
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadl_epi64((__m128i *)input);
  in[1] = _mm_loadl_epi64((__m128i *)(input + 4));
  in[2] = _mm_loadl_epi64((__m128i *)(input + 8));
  in[3] = _mm_loadl_epi64((__m128i *)(input + 12));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[2] = _mm_add_epi16(in[2], eight);
  in[3] = _mm_add_epi16(in[3], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);
  in[2] = _mm_srai_epi16(in[2], 4);
  in[3] = _mm_srai_epi16(in[3], 4);

  // One invocation per row; the macro advances |dest| by |stride| each time.
  RECON_AND_STORE4X4(dest, in[0]);
  RECON_AND_STORE4X4(dest, in[1]);
  RECON_AND_STORE4X4(dest, in[2]);
  RECON_AND_STORE4X4(dest, in[3]);
}
35191037db265ecdd914a26e056cf69207b4f50924ehkuang
// Transposes an 8x8 block of 16-bit values.
//
// in0..in7 are __m128i rows (eight 16-bit lanes each); out0..out7 receive the
// columns, i.e. lane j of out<i> equals lane i of in<j>.  All inputs are read
// before any output is assigned, so the outputs may name the same variables as
// the inputs.  Every argument is evaluated more than once, so pass plain
// variables or array elements, not expressions with side effects.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
                                                        \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
                                                            \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }
382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// TRANSPOSE_4X8: transpose the low four 16-bit lanes of eight rows.
//   in0..in7   : input rows; only lanes 0..3 of each row are read.
//   out0..out3 : receive columns 0..3; lane k of each comes from row in<k>.
//   out4..out7 : set to all-zero.
// Every input is consumed before the first output is written, so the outputs
// may be the same variables as the inputs.  The zero vector is produced inside
// the macro, so no variable named `zero` is required at the expansion site.
#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                     \
    /* 16-bit interleave of adjacent row pairs (low halves only) */ \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
                                                        \
    /* 32-bit interleave: two columns of four rows per register */ \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
                                                            \
    /* 64-bit interleave: join rows 0-3 with rows 4-7 */ \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_setzero_si128(); \
    out5 = _mm_setzero_si128(); \
    out6 = _mm_setzero_si128(); \
    out7 = _mm_setzero_si128(); \
  }
402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// TRANSPOSE_8X4: transpose four rows of eight 16-bit lanes.
// Each output packs two 4-element columns, even column in the low 64 bits:
//   out0 = {col0, col1}   out1 = {col2, col3}
//   out2 = {col4, col5}   out3 = {col6, col7}
// Results are stored through out0..out3 (previously the out parameters were
// ignored and in0..in3 were overwritten).  All inputs are consumed before the
// first output is written, so passing the input variables as outputs keeps
// the in-place behaviour.
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                     \
    /* 16-bit interleave of rows (0,1) and (2,3) */     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
                                                        \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
    out2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */  \
    out3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */  \
  }
415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// MULTIPLICATION_AND_ADD: multiply interleaved 16-bit pairs by constant pairs
// (_mm_madd_epi16), add `rounding`, shift right by DCT_CONST_BITS and narrow
// the 32-bit sums back to 16 bits with signed saturation.
//   res0 <- (lo_0, hi_0) with cst0      res1 <- (lo_0, hi_0) with cst1
//   res2 <- (lo_1, hi_1) with cst2      res3 <- (lo_1, hi_1) with cst3
// Uses tmp0..tmp7 (scratch), `rounding` and DCT_CONST_BITS from the
// expansion site.  The results are written only after every product has been
// formed.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_0, cst0), rounding), DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_0, cst0), rounding), DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_0, cst1), rounding), DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_0, cst1), rounding), DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_1, cst2), rounding), DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_1, cst2), rounding), DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(lo_1, cst3), rounding), DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(hi_1, cst3), rounding), DCT_CONST_BITS); \
    \
    /* low four lanes come from the lo_* product, high four from hi_* */ \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }
452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// IDCT8x8_1D: four-stage butterfly of the 1-D 8-point inverse DCT, applied
// lane-wise to in0..in7 and written back to in0..in7.
// Expands to bare statements (its own trailing ';' included), so it is used
// without a following semicolon.  Requires at the expansion site:
//   in0..in7, stp1_0..stp1_7, stp2_0..stp2_7, tmp0..tmp7 (scratch),
//   stg1_0..stg1_3, stg2_0..stg2_3, rounding, DCT_CONST_BITS and the
//   MULTIPLICATION_AND_ADD macro.
#define IDCT8x8_1D  \
  /* Stage 1: rotate the odd inputs (1,7) and (3,5) */ \
  { \
    const __m128i pair17_lo = _mm_unpacklo_epi16(in1, in7); \
    const __m128i pair17_hi = _mm_unpackhi_epi16(in1, in7); \
    const __m128i pair35_lo = _mm_unpacklo_epi16(in3, in5); \
    const __m128i pair35_hi = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(pair17_lo, pair17_hi, pair35_lo, pair35_hi, \
                           stg1_0, stg1_1, stg1_2, stg1_3,             \
                           stp1_4, stp1_7, stp1_5, stp1_6)             \
  } \
  \
  /* Stage 2: odd-half butterflies, then rotate the even inputs */ \
  { \
    const __m128i pair04_lo = _mm_unpacklo_epi16(in0, in4); \
    const __m128i pair04_hi = _mm_unpackhi_epi16(in0, in4); \
    const __m128i pair26_lo = _mm_unpacklo_epi16(in2, in6); \
    const __m128i pair26_hi = _mm_unpackhi_epi16(in2, in6); \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(pair04_lo, pair04_hi, pair26_lo, pair26_hi, \
                           stg2_0, stg2_1, stg2_2, stg2_3,             \
                           stp2_0, stp2_1, stp2_2, stp2_3)             \
  } \
  \
  /* Stage 3: even-half butterflies and the (6,5) rotation */ \
  { \
    const __m128i pair65_lo = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i pair65_hi = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    \
    tmp0 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(pair65_lo, stg2_1), rounding), \
        DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(pair65_hi, stg2_1), rounding), \
        DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(pair65_lo, stg2_0), rounding), \
        DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32( \
        _mm_add_epi32(_mm_madd_epi16(pair65_hi, stg2_0), rounding), \
        DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage 4: final butterflies, outputs k and 7-k share their operands */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4);
521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// RECON_AND_STORE: add one row of eight 16-bit residuals (in_x) to the eight
// 8-bit pixels at dest, narrow with unsigned saturation (clamps to 0..255),
// write the row back and advance dest by stride.
// Expects `zero` (an all-zero __m128i) and `stride` at the expansion site.
// in_x is overwritten with the packed bytes; dest must be a modifiable
// pointer lvalue.
#define RECON_AND_STORE(dest, in_x) \
  { \
    /* load 8 pixels and widen them to 16 bits */ \
    __m128i px16 = \
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(dest)), zero); \
    in_x = _mm_add_epi16(in_x, px16); \
    in_x = _mm_packus_epi16(in_x, in_x); \
    _mm_storel_epi64((__m128i *)(dest), in_x); \
    dest += stride; \
  }
531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<4);
536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Load input data.
552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_load_si128((__m128i *)input);
553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_load_si128((__m128i *)(input + 8 * 4));
557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_load_si128((__m128i *)(input + 8 * 5));
558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_load_si128((__m128i *)(input + 8 * 6));
559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_load_si128((__m128i *)(input + 8 * 7));
560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 2-D
562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 2; i++) {
563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                  in4, in5, in6, in7);
566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // 4-stage 1D idct8x8
568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    IDCT8x8_1D
569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final rounding and shift
572ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(in0, final_rounding);
573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(in1, final_rounding);
574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(in2, final_rounding);
575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(in3, final_rounding);
576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_adds_epi16(in4, final_rounding);
577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_adds_epi16(in5, final_rounding);
578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_adds_epi16(in6, final_rounding);
579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_adds_epi16(in7, final_rounding);
580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_srai_epi16(in0, 5);
582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_srai_epi16(in1, 5);
583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_srai_epi16(in2, 5);
584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_srai_epi16(in3, 5);
585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_srai_epi16(in4, 5);
586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_srai_epi16(in5, 5);
587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_srai_epi16(in6, 5);
588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_srai_epi16(in7, 5);
589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in0);
591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in1);
592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in2);
593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in3);
594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in4);
595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in5);
596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in6);
597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in7);
598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
60091037db265ecdd914a26e056cf69207b4f50924ehkuang// perform 8x8 transpose
60191037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
60291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
60391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
60491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
60591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
60691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
60791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
60891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
60991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
61091037db265ecdd914a26e056cf69207b4f50924ehkuang
61191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
61291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
61391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
61491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
61591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
61691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
61791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
61891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
61991037db265ecdd914a26e056cf69207b4f50924ehkuang
62091037db265ecdd914a26e056cf69207b4f50924ehkuang  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
62191037db265ecdd914a26e056cf69207b4f50924ehkuang  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
62291037db265ecdd914a26e056cf69207b4f50924ehkuang  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
62391037db265ecdd914a26e056cf69207b4f50924ehkuang  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
62491037db265ecdd914a26e056cf69207b4f50924ehkuang  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
62591037db265ecdd914a26e056cf69207b4f50924ehkuang  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
62691037db265ecdd914a26e056cf69207b4f50924ehkuang  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
62791037db265ecdd914a26e056cf69207b4f50924ehkuang  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
62891037db265ecdd914a26e056cf69207b4f50924ehkuang}
62991037db265ecdd914a26e056cf69207b4f50924ehkuang
63091037db265ecdd914a26e056cf69207b4f50924ehkuangvoid idct8_1d_sse2(__m128i *in) {
63191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
63291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
63391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
63491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
63591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
63691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
63791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
63891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
63991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
64091037db265ecdd914a26e056cf69207b4f50924ehkuang
64191037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
64291037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
64391037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
64491037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
64591037db265ecdd914a26e056cf69207b4f50924ehkuang
64691037db265ecdd914a26e056cf69207b4f50924ehkuang  in0 = in[0];
64791037db265ecdd914a26e056cf69207b4f50924ehkuang  in1 = in[1];
64891037db265ecdd914a26e056cf69207b4f50924ehkuang  in2 = in[2];
64991037db265ecdd914a26e056cf69207b4f50924ehkuang  in3 = in[3];
65091037db265ecdd914a26e056cf69207b4f50924ehkuang  in4 = in[4];
65191037db265ecdd914a26e056cf69207b4f50924ehkuang  in5 = in[5];
65291037db265ecdd914a26e056cf69207b4f50924ehkuang  in6 = in[6];
65391037db265ecdd914a26e056cf69207b4f50924ehkuang  in7 = in[7];
65491037db265ecdd914a26e056cf69207b4f50924ehkuang
65591037db265ecdd914a26e056cf69207b4f50924ehkuang  // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
65691037db265ecdd914a26e056cf69207b4f50924ehkuang  TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
65791037db265ecdd914a26e056cf69207b4f50924ehkuang                in4, in5, in6, in7);
65891037db265ecdd914a26e056cf69207b4f50924ehkuang
65991037db265ecdd914a26e056cf69207b4f50924ehkuang  // 4-stage 1D idct8x8
66091037db265ecdd914a26e056cf69207b4f50924ehkuang  IDCT8x8_1D
66191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = in0;
66291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = in1;
66391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = in2;
66491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = in3;
66591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = in4;
66691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = in5;
66791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = in6;
66891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = in7;
66991037db265ecdd914a26e056cf69207b4f50924ehkuang}
67091037db265ecdd914a26e056cf69207b4f50924ehkuang
67191037db265ecdd914a26e056cf69207b4f50924ehkuangvoid iadst8_1d_sse2(__m128i *in) {
67291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
67391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
67491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
67591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
67691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
67791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
67891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
67991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
68091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
68191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
68291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
68391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
68491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
68591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__const_0 = _mm_set1_epi16(0);
68691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
68791037db265ecdd914a26e056cf69207b4f50924ehkuang
68891037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
68991037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
69091037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
69191037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
69291037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
69391037db265ecdd914a26e056cf69207b4f50924ehkuang
69491037db265ecdd914a26e056cf69207b4f50924ehkuang  // transpose
69591037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(in, in);
69691037db265ecdd914a26e056cf69207b4f50924ehkuang
69791037db265ecdd914a26e056cf69207b4f50924ehkuang  // properly aligned for butterfly input
69891037db265ecdd914a26e056cf69207b4f50924ehkuang  in0  = in[7];
69991037db265ecdd914a26e056cf69207b4f50924ehkuang  in1  = in[0];
70091037db265ecdd914a26e056cf69207b4f50924ehkuang  in2  = in[5];
70191037db265ecdd914a26e056cf69207b4f50924ehkuang  in3  = in[2];
70291037db265ecdd914a26e056cf69207b4f50924ehkuang  in4  = in[3];
70391037db265ecdd914a26e056cf69207b4f50924ehkuang  in5  = in[4];
70491037db265ecdd914a26e056cf69207b4f50924ehkuang  in6  = in[1];
70591037db265ecdd914a26e056cf69207b4f50924ehkuang  in7  = in[6];
70691037db265ecdd914a26e056cf69207b4f50924ehkuang
70791037db265ecdd914a26e056cf69207b4f50924ehkuang  // column transformation
70891037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
70991037db265ecdd914a26e056cf69207b4f50924ehkuang  // interleave and multiply/add into 32-bit integer
71091037db265ecdd914a26e056cf69207b4f50924ehkuang  s0 = _mm_unpacklo_epi16(in0, in1);
71191037db265ecdd914a26e056cf69207b4f50924ehkuang  s1 = _mm_unpackhi_epi16(in0, in1);
71291037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_unpacklo_epi16(in2, in3);
71391037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_unpackhi_epi16(in2, in3);
71491037db265ecdd914a26e056cf69207b4f50924ehkuang  s4 = _mm_unpacklo_epi16(in4, in5);
71591037db265ecdd914a26e056cf69207b4f50924ehkuang  s5 = _mm_unpackhi_epi16(in4, in5);
71691037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_unpacklo_epi16(in6, in7);
71791037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_unpackhi_epi16(in6, in7);
71891037db265ecdd914a26e056cf69207b4f50924ehkuang
71991037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
72091037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
72191037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
72291037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
72391037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
72491037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
72591037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
72691037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
72791037db265ecdd914a26e056cf69207b4f50924ehkuang  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
72891037db265ecdd914a26e056cf69207b4f50924ehkuang  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
72991037db265ecdd914a26e056cf69207b4f50924ehkuang  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
73091037db265ecdd914a26e056cf69207b4f50924ehkuang  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
73191037db265ecdd914a26e056cf69207b4f50924ehkuang  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
73291037db265ecdd914a26e056cf69207b4f50924ehkuang  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
73391037db265ecdd914a26e056cf69207b4f50924ehkuang  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
73491037db265ecdd914a26e056cf69207b4f50924ehkuang  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
73591037db265ecdd914a26e056cf69207b4f50924ehkuang
73691037db265ecdd914a26e056cf69207b4f50924ehkuang  // addition
73791037db265ecdd914a26e056cf69207b4f50924ehkuang  w0 = _mm_add_epi32(u0, u8);
73891037db265ecdd914a26e056cf69207b4f50924ehkuang  w1 = _mm_add_epi32(u1, u9);
73991037db265ecdd914a26e056cf69207b4f50924ehkuang  w2 = _mm_add_epi32(u2, u10);
74091037db265ecdd914a26e056cf69207b4f50924ehkuang  w3 = _mm_add_epi32(u3, u11);
74191037db265ecdd914a26e056cf69207b4f50924ehkuang  w4 = _mm_add_epi32(u4, u12);
74291037db265ecdd914a26e056cf69207b4f50924ehkuang  w5 = _mm_add_epi32(u5, u13);
74391037db265ecdd914a26e056cf69207b4f50924ehkuang  w6 = _mm_add_epi32(u6, u14);
74491037db265ecdd914a26e056cf69207b4f50924ehkuang  w7 = _mm_add_epi32(u7, u15);
74591037db265ecdd914a26e056cf69207b4f50924ehkuang  w8 = _mm_sub_epi32(u0, u8);
74691037db265ecdd914a26e056cf69207b4f50924ehkuang  w9 = _mm_sub_epi32(u1, u9);
74791037db265ecdd914a26e056cf69207b4f50924ehkuang  w10 = _mm_sub_epi32(u2, u10);
74891037db265ecdd914a26e056cf69207b4f50924ehkuang  w11 = _mm_sub_epi32(u3, u11);
74991037db265ecdd914a26e056cf69207b4f50924ehkuang  w12 = _mm_sub_epi32(u4, u12);
75091037db265ecdd914a26e056cf69207b4f50924ehkuang  w13 = _mm_sub_epi32(u5, u13);
75191037db265ecdd914a26e056cf69207b4f50924ehkuang  w14 = _mm_sub_epi32(u6, u14);
75291037db265ecdd914a26e056cf69207b4f50924ehkuang  w15 = _mm_sub_epi32(u7, u15);
75391037db265ecdd914a26e056cf69207b4f50924ehkuang
75491037db265ecdd914a26e056cf69207b4f50924ehkuang  // shift and rounding
75591037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
75691037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
75791037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
75891037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
75991037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
76091037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
76191037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
76291037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
76391037db265ecdd914a26e056cf69207b4f50924ehkuang  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
76491037db265ecdd914a26e056cf69207b4f50924ehkuang  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
76591037db265ecdd914a26e056cf69207b4f50924ehkuang  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
76691037db265ecdd914a26e056cf69207b4f50924ehkuang  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
76791037db265ecdd914a26e056cf69207b4f50924ehkuang  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
76891037db265ecdd914a26e056cf69207b4f50924ehkuang  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
76991037db265ecdd914a26e056cf69207b4f50924ehkuang  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
77091037db265ecdd914a26e056cf69207b4f50924ehkuang  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
77191037db265ecdd914a26e056cf69207b4f50924ehkuang
77291037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
77391037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
77491037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
77591037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
77691037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
77791037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
77891037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
77991037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
78091037db265ecdd914a26e056cf69207b4f50924ehkuang  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
78191037db265ecdd914a26e056cf69207b4f50924ehkuang  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
78291037db265ecdd914a26e056cf69207b4f50924ehkuang  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
78391037db265ecdd914a26e056cf69207b4f50924ehkuang  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
78491037db265ecdd914a26e056cf69207b4f50924ehkuang  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
78591037db265ecdd914a26e056cf69207b4f50924ehkuang  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
78691037db265ecdd914a26e056cf69207b4f50924ehkuang  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
78791037db265ecdd914a26e056cf69207b4f50924ehkuang  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
78891037db265ecdd914a26e056cf69207b4f50924ehkuang
78991037db265ecdd914a26e056cf69207b4f50924ehkuang  // back to 16-bit and pack 8 integers into __m128i
79091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_packs_epi32(u0, u1);
79191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_packs_epi32(u2, u3);
79291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_packs_epi32(u4, u5);
79391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_packs_epi32(u6, u7);
79491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_packs_epi32(u8, u9);
79591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_packs_epi32(u10, u11);
79691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_packs_epi32(u12, u13);
79791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_packs_epi32(u14, u15);
79891037db265ecdd914a26e056cf69207b4f50924ehkuang
79991037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
80091037db265ecdd914a26e056cf69207b4f50924ehkuang  s0 = _mm_add_epi16(in[0], in[2]);
80191037db265ecdd914a26e056cf69207b4f50924ehkuang  s1 = _mm_add_epi16(in[1], in[3]);
80291037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_sub_epi16(in[0], in[2]);
80391037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_sub_epi16(in[1], in[3]);
80491037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_unpacklo_epi16(in[4], in[5]);
80591037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_unpackhi_epi16(in[4], in[5]);
80691037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_unpacklo_epi16(in[6], in[7]);
80791037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_unpackhi_epi16(in[6], in[7]);
80891037db265ecdd914a26e056cf69207b4f50924ehkuang
80991037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
81091037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
81191037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
81291037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
81391037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
81491037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
81591037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
81691037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
81791037db265ecdd914a26e056cf69207b4f50924ehkuang
81891037db265ecdd914a26e056cf69207b4f50924ehkuang  w0 = _mm_add_epi32(v0, v4);
81991037db265ecdd914a26e056cf69207b4f50924ehkuang  w1 = _mm_add_epi32(v1, v5);
82091037db265ecdd914a26e056cf69207b4f50924ehkuang  w2 = _mm_add_epi32(v2, v6);
82191037db265ecdd914a26e056cf69207b4f50924ehkuang  w3 = _mm_add_epi32(v3, v7);
82291037db265ecdd914a26e056cf69207b4f50924ehkuang  w4 = _mm_sub_epi32(v0, v4);
82391037db265ecdd914a26e056cf69207b4f50924ehkuang  w5 = _mm_sub_epi32(v1, v5);
82491037db265ecdd914a26e056cf69207b4f50924ehkuang  w6 = _mm_sub_epi32(v2, v6);
82591037db265ecdd914a26e056cf69207b4f50924ehkuang  w7 = _mm_sub_epi32(v3, v7);
82691037db265ecdd914a26e056cf69207b4f50924ehkuang
82791037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
82891037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
82991037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
83091037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
83191037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
83291037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
83391037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
83491037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
83591037db265ecdd914a26e056cf69207b4f50924ehkuang
83691037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
83791037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
83891037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
83991037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
84091037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
84191037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
84291037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
84391037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
84491037db265ecdd914a26e056cf69207b4f50924ehkuang
84591037db265ecdd914a26e056cf69207b4f50924ehkuang  // back to 16-bit integers
84691037db265ecdd914a26e056cf69207b4f50924ehkuang  s4 = _mm_packs_epi32(u0, u1);
84791037db265ecdd914a26e056cf69207b4f50924ehkuang  s5 = _mm_packs_epi32(u2, u3);
84891037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_packs_epi32(u4, u5);
84991037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_packs_epi32(u6, u7);
85091037db265ecdd914a26e056cf69207b4f50924ehkuang
85191037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
85291037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_unpacklo_epi16(s2, s3);
85391037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_unpackhi_epi16(s2, s3);
85491037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_unpacklo_epi16(s6, s7);
85591037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_unpackhi_epi16(s6, s7);
85691037db265ecdd914a26e056cf69207b4f50924ehkuang
85791037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
85891037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
85991037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
86091037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
86191037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
86291037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
86391037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
86491037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
86591037db265ecdd914a26e056cf69207b4f50924ehkuang
86691037db265ecdd914a26e056cf69207b4f50924ehkuang  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
86791037db265ecdd914a26e056cf69207b4f50924ehkuang  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
86891037db265ecdd914a26e056cf69207b4f50924ehkuang  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
86991037db265ecdd914a26e056cf69207b4f50924ehkuang  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
87091037db265ecdd914a26e056cf69207b4f50924ehkuang  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
87191037db265ecdd914a26e056cf69207b4f50924ehkuang  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
87291037db265ecdd914a26e056cf69207b4f50924ehkuang  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
87391037db265ecdd914a26e056cf69207b4f50924ehkuang  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
87491037db265ecdd914a26e056cf69207b4f50924ehkuang
87591037db265ecdd914a26e056cf69207b4f50924ehkuang  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
87691037db265ecdd914a26e056cf69207b4f50924ehkuang  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
87791037db265ecdd914a26e056cf69207b4f50924ehkuang  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
87891037db265ecdd914a26e056cf69207b4f50924ehkuang  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
87991037db265ecdd914a26e056cf69207b4f50924ehkuang  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
88091037db265ecdd914a26e056cf69207b4f50924ehkuang  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
88191037db265ecdd914a26e056cf69207b4f50924ehkuang  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
88291037db265ecdd914a26e056cf69207b4f50924ehkuang  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
88391037db265ecdd914a26e056cf69207b4f50924ehkuang
88491037db265ecdd914a26e056cf69207b4f50924ehkuang  s2 = _mm_packs_epi32(v0, v1);
88591037db265ecdd914a26e056cf69207b4f50924ehkuang  s3 = _mm_packs_epi32(v2, v3);
88691037db265ecdd914a26e056cf69207b4f50924ehkuang  s6 = _mm_packs_epi32(v4, v5);
88791037db265ecdd914a26e056cf69207b4f50924ehkuang  s7 = _mm_packs_epi32(v6, v7);
88891037db265ecdd914a26e056cf69207b4f50924ehkuang
88991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = s0;
89091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_sub_epi16(k__const_0, s4);
89191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = s6;
89291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_sub_epi16(k__const_0, s2);
89391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = s3;
89491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_sub_epi16(k__const_0, s7);
89591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = s5;
89691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_sub_epi16(k__const_0, s1);
89791037db265ecdd914a26e056cf69207b4f50924ehkuang}
89891037db265ecdd914a26e056cf69207b4f50924ehkuang
89991037db265ecdd914a26e056cf69207b4f50924ehkuang
// Inverse 8x8 hybrid transform (SSE2) with reconstruction into dest.
//
// input:   64 coefficients, fetched as eight 128-bit vectors with
//          _mm_load_si128, so the buffer must be 16-byte aligned.
// dest:    destination pixels, handed to RECON_AND_STORE once per vector.
// stride:  not referenced directly in this body; presumably consumed by
//          RECON_AND_STORE to advance dest (confirm against the macro).
// tx_type: 0..3 picks the pair of 1-D kernels below; any other value hits
//          assert(0) and, when asserts are compiled out, skips the
//          transform passes entirely.
void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in[8];
  // Not referenced directly below; presumably picked up by RECON_AND_STORE.
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  int row;

  // Fetch the eight 8-coefficient vectors.
  for (row = 0; row < 8; ++row) {
    in[row] = _mm_load_si128((__m128i *)(input + 8 * row));
  }

  // Two 1-D passes; the kernel used for each pass depends on tx_type.
  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift: saturating add of 16, then arithmetic >> 5.
  for (row = 0; row < 8; ++row) {
    in[row] = _mm_srai_epi16(_mm_adds_epi16(in[row], final_rounding), 5);
  }

  // Combine each result vector with dest and store it.
  for (row = 0; row < 8; ++row) {
    RECON_AND_STORE(dest, in[row]);
  }
}
96691037db265ecdd914a26e056cf69207b4f50924ehkuang
967ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
968ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
969ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
970ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<4);
971ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
972ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
973ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
974ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
975ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
976ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
977ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
978ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
979ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
980ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
981ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
982ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
983ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
984ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
985ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
986ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Rows. Load 4-row input data.
987ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_load_si128((__m128i *)input);
988ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
989ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
990ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
991ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
992ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 8x4 Transpose
993ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
994ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
995ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage1
996ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
997ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
998ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);
999ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1000ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
1001ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
1002ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
1003ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
1004ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1005ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
1006ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
1007ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
1008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
1009ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1010ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1011ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1012ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1013ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1014ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_4 = _mm_packs_epi32(tmp0, zero);
1015ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_7 = _mm_packs_epi32(tmp2, zero);
1016ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp4, zero);
1017ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp6, zero);
1018ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1019ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1020ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage2
1021ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
1022ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
1023ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);
1024ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1025ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
1026ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
1027ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
1028ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
1029ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1030ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
1031ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
1032ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
1033ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
1034ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1035ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1036ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1037ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1038ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1039ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_0 = _mm_packs_epi32(tmp0, zero);
1040ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_1 = _mm_packs_epi32(tmp2, zero);
1041ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_2 = _mm_packs_epi32(tmp4, zero);
1042ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_3 = _mm_packs_epi32(tmp6, zero);
1043ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1044ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
1045ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
1046ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
1047ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
1048ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1049ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1050ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage3
1051ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
1052ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
1053ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
1054ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
1055ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
1056ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);
1057ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1058ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
1059ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
1060ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1061ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
1062ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
1063ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1064ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1065ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1066ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp0, zero);
1067ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp2, zero);
1068ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1069ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1070ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage4
1071ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(stp1_0, stp2_7);
1072ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(stp1_1, stp1_6);
1073ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(stp1_2, stp1_5);
1074ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(stp1_3, stp2_4);
1075ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_subs_epi16(stp1_3, stp2_4);
1076ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_subs_epi16(stp1_2, stp1_5);
1077ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_subs_epi16(stp1_1, stp1_6);
1078ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_subs_epi16(stp1_0, stp2_7);
1079ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1080ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Columns. 4x8 Transpose
1081ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
1082ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                in4, in5, in6, in7)
1083ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1084ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 1D idct8x8
1085ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  IDCT8x8_1D
1086ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1087ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final rounding and shift
1088ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_adds_epi16(in0, final_rounding);
1089ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_adds_epi16(in1, final_rounding);
1090ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_adds_epi16(in2, final_rounding);
1091ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_adds_epi16(in3, final_rounding);
1092ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_adds_epi16(in4, final_rounding);
1093ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_adds_epi16(in5, final_rounding);
1094ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_adds_epi16(in6, final_rounding);
1095ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_adds_epi16(in7, final_rounding);
1096ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1097ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_srai_epi16(in0, 5);
1098ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_srai_epi16(in1, 5);
1099ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_srai_epi16(in2, 5);
1100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_srai_epi16(in3, 5);
1101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in4 = _mm_srai_epi16(in4, 5);
1102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in5 = _mm_srai_epi16(in5, 5);
1103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in6 = _mm_srai_epi16(in6, 5);
1104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in7 = _mm_srai_epi16(in7, 5);
1105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in0);
1107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in1);
1108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in2);
1109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in3);
1110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in4);
1111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in5);
1112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in6);
1113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  RECON_AND_STORE(dest, in7);
1114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
/*
 * IDCT16x16_1D: stages 2 through 6 of a 16-point 1-D inverse DCT, written as
 * a statement macro so it can be expanded inside a function body.
 *
 * The macro takes no arguments; it reads and writes names that must already
 * be declared at the expansion site:
 *   in0..in15          input vectors (read only here)
 *   stg2_0..stg2_7,
 *   stg3_0..stg3_3,
 *   stg4_0..stg4_7,
 *   stg6_0             coefficient-pair constants
 *   rounding           32-bit rounding term added before the
 *                      DCT_CONST_BITS shift in stage 5
 *   tmp0..tmp3         32-bit scratch used directly in stage 5
 *   stp1_*, stp2_*     intermediate / result vectors
 *
 * MULTIPLICATION_AND_ADD is defined elsewhere in the file; from the way it
 * is used here its last four arguments are outputs, and it presumably needs
 * further scratch names (tmp4..tmp7) -- confirm against its definition.
 *
 * After expansion the values needed for the final butterfly (stage 7) are
 * held in: stp2_0..stp2_7, stp1_8, stp1_9, stp2_10..stp2_13, stp1_14,
 * stp1_15.
 */
#define IDCT16x16_1D \
  /* Stage2: rotate the odd-indexed inputs into stp2_8..stp2_15 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7);   \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7);   \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
    \
  /* Stage3: rotate in2/in14 and in10/in6, butterfly stp2_8..stp2_15 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    /* stp1_8_0 / stp1_12_0 are kept aside: stage 5 needs them after */ \
    /* stp1_8..stp1_15 have been reassigned. */ \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4: rotate in0/in8, in4/in12 and the stp1_9/14, stp1_10/13 pairs */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
    \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    /* Rotate the (stp2_6, stp2_5) pair by hand: multiply-accumulate in */ \
    /* 32 bits, round, shift by DCT_CONST_BITS, then pack back to 16 bits */ \
    /* with signed saturation. */ \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    /* stp1_8 is computed before stp1_11 is overwritten; both need the */ \
    /* stage-3 value of stp1_11. */ \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    /* Likewise stp1_12 is computed before stp1_15 is overwritten. */ \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
    \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    /* stp2_3 / stp2_0 are written before stp2_4 / stp2_7 are replaced, */ \
    /* so the statement order below matters. */ \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }
1244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
1246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1247ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
1248ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
1249ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1250ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1251ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1254ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1255ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1256ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1257ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1258ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1259ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1260ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1261ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1262ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1263ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1264ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1265ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1267ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1268ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1269ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1270ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1271ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1272ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1273ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1274ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1275ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
1276ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
1277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
1278ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in14 = zero, in15 = zero;
1279ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
1280ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
1281ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
1282ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
1283ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
1284ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          r12 = zero, r13 = zero, r14 = zero, r15 = zero;
1285ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1286ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1287ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8_0, stp1_12_0;
1288ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1289ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1290ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1291ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
1292ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1293ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct.
1294ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 4; i++) {
1295ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // 1-D idct
1296ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i < 2) {
1297ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      if (i == 1) input += 128;
1298ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1299ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Load input data.
1300ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_load_si128((__m128i *)input);
1301ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
1302ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
1303ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
1304ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
1305ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
1306ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
1307ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
1308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_load_si128((__m128i *)(input + 8 * 8));
1309ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_load_si128((__m128i *)(input + 8 * 9));
1310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_load_si128((__m128i *)(input + 8 * 10));
1311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_load_si128((__m128i *)(input + 8 * 11));
1312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_load_si128((__m128i *)(input + 8 * 12));
1313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_load_si128((__m128i *)(input + 8 * 13));
1314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_load_si128((__m128i *)(input + 8 * 14));
1315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_load_si128((__m128i *)(input + 8 * 15));
1316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
1318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
1319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
1320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in10, in11, in12, in13, in14, in15);
1321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 2) {
1324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
1325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in5, in6, in7);
1326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
1327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in13, in14, in15);
1328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 3) {
1331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
1332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
1333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
1334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in12, in13, in14, in15);
1335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    IDCT16x16_1D
1338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage7
1340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 0) {
1341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Left 8x16
1342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l0 = _mm_add_epi16(stp2_0, stp1_15);
1343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l1 = _mm_add_epi16(stp2_1, stp1_14);
1344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l2 = _mm_add_epi16(stp2_2, stp2_13);
1345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l3 = _mm_add_epi16(stp2_3, stp2_12);
1346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l4 = _mm_add_epi16(stp2_4, stp2_11);
1347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l5 = _mm_add_epi16(stp2_5, stp2_10);
1348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l6 = _mm_add_epi16(stp2_6, stp1_9);
1349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l7 = _mm_add_epi16(stp2_7, stp1_8);
1350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l8 = _mm_sub_epi16(stp2_7, stp1_8);
1351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l9 = _mm_sub_epi16(stp2_6, stp1_9);
1352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l10 = _mm_sub_epi16(stp2_5, stp2_10);
1353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l11 = _mm_sub_epi16(stp2_4, stp2_11);
1354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l12 = _mm_sub_epi16(stp2_3, stp2_12);
1355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l13 = _mm_sub_epi16(stp2_2, stp2_13);
1356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l14 = _mm_sub_epi16(stp2_1, stp1_14);
1357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      l15 = _mm_sub_epi16(stp2_0, stp1_15);
1358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else if (i == 1) {
1359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Right 8x16
1360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r0 = _mm_add_epi16(stp2_0, stp1_15);
1361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r1 = _mm_add_epi16(stp2_1, stp1_14);
1362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r2 = _mm_add_epi16(stp2_2, stp2_13);
1363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r3 = _mm_add_epi16(stp2_3, stp2_12);
1364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r4 = _mm_add_epi16(stp2_4, stp2_11);
1365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r5 = _mm_add_epi16(stp2_5, stp2_10);
1366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r6 = _mm_add_epi16(stp2_6, stp1_9);
1367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r7 = _mm_add_epi16(stp2_7, stp1_8);
1368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r8 = _mm_sub_epi16(stp2_7, stp1_8);
1369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r9 = _mm_sub_epi16(stp2_6, stp1_9);
1370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r10 = _mm_sub_epi16(stp2_5, stp2_10);
1371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r11 = _mm_sub_epi16(stp2_4, stp2_11);
1372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r12 = _mm_sub_epi16(stp2_3, stp2_12);
1373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r13 = _mm_sub_epi16(stp2_2, stp2_13);
1374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r14 = _mm_sub_epi16(stp2_1, stp1_14);
1375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      r15 = _mm_sub_epi16(stp2_0, stp1_15);
1376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else {
1377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 2-D
1378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_add_epi16(stp2_0, stp1_15);
1379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_add_epi16(stp2_1, stp1_14);
1380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_add_epi16(stp2_2, stp2_13);
1381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_add_epi16(stp2_3, stp2_12);
1382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_add_epi16(stp2_4, stp2_11);
1383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_add_epi16(stp2_5, stp2_10);
1384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_add_epi16(stp2_6, stp1_9);
1385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_add_epi16(stp2_7, stp1_8);
1386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_sub_epi16(stp2_7, stp1_8);
1387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_sub_epi16(stp2_6, stp1_9);
1388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_sub_epi16(stp2_5, stp2_10);
1389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_sub_epi16(stp2_4, stp2_11);
1390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_sub_epi16(stp2_3, stp2_12);
1391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_sub_epi16(stp2_2, stp2_13);
1392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_sub_epi16(stp2_1, stp1_14);
1393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_sub_epi16(stp2_0, stp1_15);
1394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Final rounding and shift
1396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_adds_epi16(in0, final_rounding);
1397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_adds_epi16(in1, final_rounding);
1398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_adds_epi16(in2, final_rounding);
1399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_adds_epi16(in3, final_rounding);
1400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_adds_epi16(in4, final_rounding);
1401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_adds_epi16(in5, final_rounding);
1402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_adds_epi16(in6, final_rounding);
1403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_adds_epi16(in7, final_rounding);
1404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_adds_epi16(in8, final_rounding);
1405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_adds_epi16(in9, final_rounding);
1406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_adds_epi16(in10, final_rounding);
1407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_adds_epi16(in11, final_rounding);
1408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_adds_epi16(in12, final_rounding);
1409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_adds_epi16(in13, final_rounding);
1410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_adds_epi16(in14, final_rounding);
1411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_adds_epi16(in15, final_rounding);
1412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_srai_epi16(in0, 6);
1414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_srai_epi16(in1, 6);
1415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_srai_epi16(in2, 6);
1416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_srai_epi16(in3, 6);
1417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_srai_epi16(in4, 6);
1418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_srai_epi16(in5, 6);
1419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_srai_epi16(in6, 6);
1420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_srai_epi16(in7, 6);
1421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_srai_epi16(in8, 6);
1422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_srai_epi16(in9, 6);
1423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_srai_epi16(in10, 6);
1424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_srai_epi16(in11, 6);
1425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_srai_epi16(in12, 6);
1426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_srai_epi16(in13, 6);
1427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_srai_epi16(in14, 6);
1428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_srai_epi16(in15, 6);
1429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in0);
1431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in1);
1432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in2);
1433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in3);
1434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in4);
1435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in5);
1436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in6);
1437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in7);
1438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in8);
1439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in9);
1440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in10);
1441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in11);
1442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in12);
1443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in13);
1444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in14);
1445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in15);
1446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      dest += 8 - (stride * 16);
1448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
1449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
145291037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
145391037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i tbuf[8];
145491037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res0, res0);
145591037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res1, tbuf);
145691037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res0 + 8, res1);
145791037db265ecdd914a26e056cf69207b4f50924ehkuang  array_transpose_8x8(res1 + 8, res1 + 8);
145891037db265ecdd914a26e056cf69207b4f50924ehkuang
145991037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[8] = tbuf[0];
146091037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[9] = tbuf[1];
146191037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[10] = tbuf[2];
146291037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[11] = tbuf[3];
146391037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[12] = tbuf[4];
146491037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[13] = tbuf[5];
146591037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[14] = tbuf[6];
146691037db265ecdd914a26e056cf69207b4f50924ehkuang  res0[15] = tbuf[7];
146791037db265ecdd914a26e056cf69207b4f50924ehkuang}
146891037db265ecdd914a26e056cf69207b4f50924ehkuang
146991037db265ecdd914a26e056cf69207b4f50924ehkuangvoid iadst16_1d_8col(__m128i *in) {
147091037db265ecdd914a26e056cf69207b4f50924ehkuang  // perform 16x16 1-D ADST for 8 columns
147191037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i s[16], x[16], u[32], v[32];
147291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
147391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
147491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
147591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
147691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
147791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
147891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
147991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
148091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
148191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
148291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
148391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
148491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
148591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
148691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
148791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
148891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
148991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
149091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
149191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
149291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
149391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
149491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
149591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
149691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
149791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
149891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
149991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
150091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
150191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
150291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i kZero = _mm_set1_epi16(0);
150391037db265ecdd914a26e056cf69207b4f50924ehkuang
150491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
150591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
150691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
150791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
150891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
150991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
151091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
151191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
151291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
151391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
151491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
151591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
151691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
151791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
151891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
151991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
152091037db265ecdd914a26e056cf69207b4f50924ehkuang
152191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
152291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
152391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
152491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
152591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
152691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
152791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
152891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
152991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
153091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
153191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
153291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
153391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
153491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
153591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
153691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
153791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
153891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
153991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
154091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
154191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
154291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
154391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
154491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
154591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
154691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
154791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
154891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
154991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
155091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
155191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
155291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
155391037db265ecdd914a26e056cf69207b4f50924ehkuang
155491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[16]);
155591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[17]);
155691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[18]);
155791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[19]);
155891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], v[20]);
155991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], v[21]);
156091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], v[22]);
156191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], v[23]);
156291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], v[24]);
156391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], v[25]);
156491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], v[26]);
156591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], v[27]);
156691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], v[28]);
156791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], v[29]);
156891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], v[30]);
156991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], v[31]);
157091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[16] = _mm_sub_epi32(v[0], v[16]);
157191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[17] = _mm_sub_epi32(v[1], v[17]);
157291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[18] = _mm_sub_epi32(v[2], v[18]);
157391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[19] = _mm_sub_epi32(v[3], v[19]);
157491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[20] = _mm_sub_epi32(v[4], v[20]);
157591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[21] = _mm_sub_epi32(v[5], v[21]);
157691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[22] = _mm_sub_epi32(v[6], v[22]);
157791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[23] = _mm_sub_epi32(v[7], v[23]);
157891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[24] = _mm_sub_epi32(v[8], v[24]);
157991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[25] = _mm_sub_epi32(v[9], v[25]);
158091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[26] = _mm_sub_epi32(v[10], v[26]);
158191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[27] = _mm_sub_epi32(v[11], v[27]);
158291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[28] = _mm_sub_epi32(v[12], v[28]);
158391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[29] = _mm_sub_epi32(v[13], v[29]);
158491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[30] = _mm_sub_epi32(v[14], v[30]);
158591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[31] = _mm_sub_epi32(v[15], v[31]);
158691037db265ecdd914a26e056cf69207b4f50924ehkuang
158791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
158891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
158991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
159091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
159191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
159291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
159391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
159491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
159591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
159691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
159791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
159891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
159991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
160091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
160191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
160291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
160391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
160491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
160591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
160691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
160791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
160891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
160991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
161091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
161191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
161291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
161391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
161491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
161591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
161691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
161791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
161891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
161991037db265ecdd914a26e056cf69207b4f50924ehkuang
162091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
162191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
162291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
162391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
162491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
162591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
162691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
162791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
162891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
162991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
163091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
163191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
163291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
163391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
163491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
163591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
163691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
163791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
163891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
163991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
164091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
164191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
164291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
164391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
164491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
164591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
164691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
164791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
164891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
164991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
165091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
165191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
165291037db265ecdd914a26e056cf69207b4f50924ehkuang
165391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_packs_epi32(u[0], u[1]);
165491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_packs_epi32(u[2], u[3]);
165591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_packs_epi32(u[4], u[5]);
165691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_packs_epi32(u[6], u[7]);
165791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_packs_epi32(u[8], u[9]);
165891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_packs_epi32(u[10], u[11]);
165991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_packs_epi32(u[12], u[13]);
166091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_packs_epi32(u[14], u[15]);
166191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = _mm_packs_epi32(u[16], u[17]);
166291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = _mm_packs_epi32(u[18], u[19]);
166391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[20], u[21]);
166491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[22], u[23]);
166591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[24], u[25]);
166691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[26], u[27]);
166791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[28], u[29]);
166891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(u[30], u[31]);
166991037db265ecdd914a26e056cf69207b4f50924ehkuang
167091037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
167191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
167291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
167391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
167491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
167591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
167691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
167791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
167891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
167991037db265ecdd914a26e056cf69207b4f50924ehkuang
168091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
168191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
168291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
168391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
168491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
168591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
168691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
168791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
168891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
168991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
169091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
169191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
169291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
169391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
169491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
169591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
169691037db265ecdd914a26e056cf69207b4f50924ehkuang
169791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[8]);
169891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[9]);
169991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[10]);
170091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[11]);
170191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], v[12]);
170291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], v[13]);
170391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], v[14]);
170491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], v[15]);
170591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_sub_epi32(v[0], v[8]);
170691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_sub_epi32(v[1], v[9]);
170791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_sub_epi32(v[2], v[10]);
170891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_sub_epi32(v[3], v[11]);
170991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_sub_epi32(v[4], v[12]);
171091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_sub_epi32(v[5], v[13]);
171191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_sub_epi32(v[6], v[14]);
171291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_sub_epi32(v[7], v[15]);
171391037db265ecdd914a26e056cf69207b4f50924ehkuang
171491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
171591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
171691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
171791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
171891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
171991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
172091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
172191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
172291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
172391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
172491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
172591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
172691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
172791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
172891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
172991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
173091037db265ecdd914a26e056cf69207b4f50924ehkuang
173191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
173291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
173391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
173491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
173591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
173691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
173791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
173891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
173991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
174091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
174191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
174291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
174391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
174491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
174591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
174691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
174791037db265ecdd914a26e056cf69207b4f50924ehkuang
174891037db265ecdd914a26e056cf69207b4f50924ehkuang  x[0] = _mm_add_epi16(s[0], s[4]);
174991037db265ecdd914a26e056cf69207b4f50924ehkuang  x[1] = _mm_add_epi16(s[1], s[5]);
175091037db265ecdd914a26e056cf69207b4f50924ehkuang  x[2] = _mm_add_epi16(s[2], s[6]);
175191037db265ecdd914a26e056cf69207b4f50924ehkuang  x[3] = _mm_add_epi16(s[3], s[7]);
175291037db265ecdd914a26e056cf69207b4f50924ehkuang  x[4] = _mm_sub_epi16(s[0], s[4]);
175391037db265ecdd914a26e056cf69207b4f50924ehkuang  x[5] = _mm_sub_epi16(s[1], s[5]);
175491037db265ecdd914a26e056cf69207b4f50924ehkuang  x[6] = _mm_sub_epi16(s[2], s[6]);
175591037db265ecdd914a26e056cf69207b4f50924ehkuang  x[7] = _mm_sub_epi16(s[3], s[7]);
175691037db265ecdd914a26e056cf69207b4f50924ehkuang  x[8] = _mm_packs_epi32(u[0], u[1]);
175791037db265ecdd914a26e056cf69207b4f50924ehkuang  x[9] = _mm_packs_epi32(u[2], u[3]);
175891037db265ecdd914a26e056cf69207b4f50924ehkuang  x[10] = _mm_packs_epi32(u[4], u[5]);
175991037db265ecdd914a26e056cf69207b4f50924ehkuang  x[11] = _mm_packs_epi32(u[6], u[7]);
176091037db265ecdd914a26e056cf69207b4f50924ehkuang  x[12] = _mm_packs_epi32(u[8], u[9]);
176191037db265ecdd914a26e056cf69207b4f50924ehkuang  x[13] = _mm_packs_epi32(u[10], u[11]);
176291037db265ecdd914a26e056cf69207b4f50924ehkuang  x[14] = _mm_packs_epi32(u[12], u[13]);
176391037db265ecdd914a26e056cf69207b4f50924ehkuang  x[15] = _mm_packs_epi32(u[14], u[15]);
176491037db265ecdd914a26e056cf69207b4f50924ehkuang
176591037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
176691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
176791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
176891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
176991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
177091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
177191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
177291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
177391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
177491037db265ecdd914a26e056cf69207b4f50924ehkuang
177591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
177691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
177791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
177891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
177991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
178091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
178191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
178291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
178391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
178491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
178591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
178691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
178791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
178891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
178991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
179091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
179191037db265ecdd914a26e056cf69207b4f50924ehkuang
179291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], v[4]);
179391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], v[5]);
179491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], v[6]);
179591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], v[7]);
179691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_sub_epi32(v[0], v[4]);
179791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_sub_epi32(v[1], v[5]);
179891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_sub_epi32(v[2], v[6]);
179991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_sub_epi32(v[3], v[7]);
180091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], v[12]);
180191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], v[13]);
180291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], v[14]);
180391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], v[15]);
180491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_sub_epi32(v[8], v[12]);
180591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_sub_epi32(v[9], v[13]);
180691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_sub_epi32(v[10], v[14]);
180791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_sub_epi32(v[11], v[15]);
180891037db265ecdd914a26e056cf69207b4f50924ehkuang
180991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
181091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
181191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
181291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
181391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
181491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
181591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
181691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
181791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
181891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
181991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
182091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
182191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
182291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
182391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
182491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
182591037db265ecdd914a26e056cf69207b4f50924ehkuang
182691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
182791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
182891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
182991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
183091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
183191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
183291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
183391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
183491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
183591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
183691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
183791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
183891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
183991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
184091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
184191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
184291037db265ecdd914a26e056cf69207b4f50924ehkuang
184391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_add_epi16(x[0], x[2]);
184491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_add_epi16(x[1], x[3]);
184591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_sub_epi16(x[0], x[2]);
184691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_sub_epi16(x[1], x[3]);
184791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_packs_epi32(v[0], v[1]);
184891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_packs_epi32(v[2], v[3]);
184991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_packs_epi32(v[4], v[5]);
185091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_packs_epi32(v[6], v[7]);
185191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = _mm_add_epi16(x[8], x[10]);
185291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = _mm_add_epi16(x[9], x[11]);
185391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_sub_epi16(x[8], x[10]);
185491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_sub_epi16(x[9], x[11]);
185591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(v[8], v[9]);
185691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(v[10], v[11]);
185791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(v[12], v[13]);
185891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(v[14], v[15]);
185991037db265ecdd914a26e056cf69207b4f50924ehkuang
186091037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 4
186191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
186291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
186391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
186491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
186591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
186691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
186791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
186891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
186991037db265ecdd914a26e056cf69207b4f50924ehkuang
187091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
187191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
187291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
187391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
187491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
187591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
187691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
187791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
187891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
187991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
188091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
188191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
188291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
188391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
188491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
188591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
188691037db265ecdd914a26e056cf69207b4f50924ehkuang
188791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
188891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
188991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
189091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
189191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
189291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
189391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
189491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
189591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
189691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
189791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
189891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
189991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
190091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
190191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
190291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
190391037db265ecdd914a26e056cf69207b4f50924ehkuang
190491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
190591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
190691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
190791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
190891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
190991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
191091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
191191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
191291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
191391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
191491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
191591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
191691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
191791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
191891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
191991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
192091037db265ecdd914a26e056cf69207b4f50924ehkuang
192191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = s[0];
192291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_sub_epi16(kZero, s[8]);
192391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = s[12];
192491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_sub_epi16(kZero, s[4]);
192591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_packs_epi32(v[4], v[5]);
192691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_packs_epi32(v[12], v[13]);
192791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_packs_epi32(v[8], v[9]);
192891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_packs_epi32(v[0], v[1]);
192991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_packs_epi32(v[2], v[3]);
193091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_packs_epi32(v[10], v[11]);
193191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_packs_epi32(v[14], v[15]);
193291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_packs_epi32(v[6], v[7]);
193391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = s[5];
193491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_sub_epi16(kZero, s[13]);
193591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = s[9];
193691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_sub_epi16(kZero, s[1]);
193791037db265ecdd914a26e056cf69207b4f50924ehkuang}
193891037db265ecdd914a26e056cf69207b4f50924ehkuang
193991037db265ecdd914a26e056cf69207b4f50924ehkuangvoid idct16_1d_8col(__m128i *in) {
194091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
194191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
194291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
194391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
194491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
194591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
194691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
194791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
194891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
194991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
195091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
195191037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
195291037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
195391037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
195491037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
195591037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
195691037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
195791037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
195891037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
195991037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
196091037db265ecdd914a26e056cf69207b4f50924ehkuang  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
196191037db265ecdd914a26e056cf69207b4f50924ehkuang  __m128i v[16], u[16], s[16], t[16];
196291037db265ecdd914a26e056cf69207b4f50924ehkuang
196391037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 1
196491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = in[0];
196591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = in[8];
196691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = in[4];
196791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = in[12];
196891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = in[2];
196991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = in[10];
197091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = in[6];
197191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = in[14];
197291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = in[1];
197391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = in[9];
197491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = in[5];
197591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = in[13];
197691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = in[3];
197791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = in[11];
197891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = in[7];
197991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = in[15];
198091037db265ecdd914a26e056cf69207b4f50924ehkuang
198191037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 2
198291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
198391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
198491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
198591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
198691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
198791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
198891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
198991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
199091037db265ecdd914a26e056cf69207b4f50924ehkuang
199191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
199291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
199391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
199491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
199591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
199691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
199791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
199891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
199991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
200091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
200191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
200291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
200391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
200491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
200591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
200691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
200791037db265ecdd914a26e056cf69207b4f50924ehkuang
200891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
200991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
201091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
201191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
201291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
201391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
201491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
201591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
201691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
201791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
201891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
201991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
202091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
202191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
202291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
202391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
202491037db265ecdd914a26e056cf69207b4f50924ehkuang
202591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
202691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
202791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
202891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
202991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
203091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
203191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
203291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
203391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
203491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
203591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
203691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
203791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
203891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
203991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
204091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
204191037db265ecdd914a26e056cf69207b4f50924ehkuang
204291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8]  = _mm_packs_epi32(u[0], u[1]);
204391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = _mm_packs_epi32(u[2], u[3]);
204491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9]  = _mm_packs_epi32(u[4], u[5]);
204591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[6], u[7]);
204691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[8], u[9]);
204791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[10], u[11]);
204891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[12], u[13]);
204991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[14], u[15]);
205091037db265ecdd914a26e056cf69207b4f50924ehkuang
205191037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 3
205291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[0] = s[0];
205391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[1] = s[1];
205491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[2] = s[2];
205591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[3] = s[3];
205691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
205791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
205891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
205991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
206091037db265ecdd914a26e056cf69207b4f50924ehkuang
206191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
206291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
206391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
206491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
206591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
206691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
206791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
206891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
206991037db265ecdd914a26e056cf69207b4f50924ehkuang
207091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
207191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
207291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
207391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
207491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
207591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
207691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
207791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
207891037db265ecdd914a26e056cf69207b4f50924ehkuang
207991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
208091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
208191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
208291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
208391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
208491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
208591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
208691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
208791037db265ecdd914a26e056cf69207b4f50924ehkuang
208891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[4] = _mm_packs_epi32(u[0], u[1]);
208991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[7] = _mm_packs_epi32(u[2], u[3]);
209091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[5] = _mm_packs_epi32(u[4], u[5]);
209191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[6] = _mm_packs_epi32(u[6], u[7]);
209291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[8] = _mm_add_epi16(s[8], s[9]);
209391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[9] = _mm_sub_epi16(s[8], s[9]);
209491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[10] = _mm_sub_epi16(s[11], s[10]);
209591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[11] = _mm_add_epi16(s[10], s[11]);
209691037db265ecdd914a26e056cf69207b4f50924ehkuang  t[12] = _mm_add_epi16(s[12], s[13]);
209791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[13] = _mm_sub_epi16(s[12], s[13]);
209891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[14] = _mm_sub_epi16(s[15], s[14]);
209991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[15] = _mm_add_epi16(s[14], s[15]);
210091037db265ecdd914a26e056cf69207b4f50924ehkuang
210191037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 4
210291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
210391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
210491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
210591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
210691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
210791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
210891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
210991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
211091037db265ecdd914a26e056cf69207b4f50924ehkuang
211191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
211291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
211391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
211491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
211591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
211691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
211791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
211891037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
211991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
212091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
212191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
212291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
212391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
212491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
212591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
212691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
212791037db265ecdd914a26e056cf69207b4f50924ehkuang
212891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
212991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
213091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
213191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
213291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
213391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
213491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
213591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
213691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
213791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
213891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
213991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
214091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
214191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
214291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
214391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
214491037db265ecdd914a26e056cf69207b4f50924ehkuang
214591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
214691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
214791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
214891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
214991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
215091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
215191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
215291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
215391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
215491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
215591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
215691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
215791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
215891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
215991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
216091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
216191037db265ecdd914a26e056cf69207b4f50924ehkuang
216291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_packs_epi32(u[0], u[1]);
216391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_packs_epi32(u[2], u[3]);
216491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_packs_epi32(u[4], u[5]);
216591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_packs_epi32(u[6], u[7]);
216691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_add_epi16(t[4], t[5]);
216791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_sub_epi16(t[4], t[5]);
216891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_sub_epi16(t[7], t[6]);
216991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_add_epi16(t[6], t[7]);
217091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = t[8];
217191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = t[15];
217291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9]  = _mm_packs_epi32(u[8], u[9]);
217391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = _mm_packs_epi32(u[10], u[11]);
217491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[12], u[13]);
217591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[14], u[15]);
217691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = t[11];
217791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = t[12];
217891037db265ecdd914a26e056cf69207b4f50924ehkuang
217991037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 5
218091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[0] = _mm_add_epi16(s[0], s[3]);
218191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[1] = _mm_add_epi16(s[1], s[2]);
218291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[2] = _mm_sub_epi16(s[1], s[2]);
218391037db265ecdd914a26e056cf69207b4f50924ehkuang  t[3] = _mm_sub_epi16(s[0], s[3]);
218491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[4] = s[4];
218591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[7] = s[7];
218691037db265ecdd914a26e056cf69207b4f50924ehkuang
218791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
218891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
218991037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
219091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
219191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
219291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
219391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
219491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
219591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
219691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
219791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
219891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
219991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
220091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
220191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[5] = _mm_packs_epi32(u[0], u[1]);
220291037db265ecdd914a26e056cf69207b4f50924ehkuang  t[6] = _mm_packs_epi32(u[2], u[3]);
220391037db265ecdd914a26e056cf69207b4f50924ehkuang
220491037db265ecdd914a26e056cf69207b4f50924ehkuang  t[8] = _mm_add_epi16(s[8], s[11]);
220591037db265ecdd914a26e056cf69207b4f50924ehkuang  t[9] = _mm_add_epi16(s[9], s[10]);
220691037db265ecdd914a26e056cf69207b4f50924ehkuang  t[10] = _mm_sub_epi16(s[9], s[10]);
220791037db265ecdd914a26e056cf69207b4f50924ehkuang  t[11] = _mm_sub_epi16(s[8], s[11]);
220891037db265ecdd914a26e056cf69207b4f50924ehkuang  t[12] = _mm_sub_epi16(s[15], s[12]);
220991037db265ecdd914a26e056cf69207b4f50924ehkuang  t[13] = _mm_sub_epi16(s[14], s[13]);
221091037db265ecdd914a26e056cf69207b4f50924ehkuang  t[14] = _mm_add_epi16(s[13], s[14]);
221191037db265ecdd914a26e056cf69207b4f50924ehkuang  t[15] = _mm_add_epi16(s[12], s[15]);
221291037db265ecdd914a26e056cf69207b4f50924ehkuang
221391037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 6
221491037db265ecdd914a26e056cf69207b4f50924ehkuang  s[0] = _mm_add_epi16(t[0], t[7]);
221591037db265ecdd914a26e056cf69207b4f50924ehkuang  s[1] = _mm_add_epi16(t[1], t[6]);
221691037db265ecdd914a26e056cf69207b4f50924ehkuang  s[2] = _mm_add_epi16(t[2], t[5]);
221791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[3] = _mm_add_epi16(t[3], t[4]);
221891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[4] = _mm_sub_epi16(t[3], t[4]);
221991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[5] = _mm_sub_epi16(t[2], t[5]);
222091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[6] = _mm_sub_epi16(t[1], t[6]);
222191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[7] = _mm_sub_epi16(t[0], t[7]);
222291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[8] = t[8];
222391037db265ecdd914a26e056cf69207b4f50924ehkuang  s[9] = t[9];
222491037db265ecdd914a26e056cf69207b4f50924ehkuang
222591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
222691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
222791037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
222891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
222991037db265ecdd914a26e056cf69207b4f50924ehkuang
223091037db265ecdd914a26e056cf69207b4f50924ehkuang  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
223191037db265ecdd914a26e056cf69207b4f50924ehkuang  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
223291037db265ecdd914a26e056cf69207b4f50924ehkuang  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
223391037db265ecdd914a26e056cf69207b4f50924ehkuang  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
223491037db265ecdd914a26e056cf69207b4f50924ehkuang  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
223591037db265ecdd914a26e056cf69207b4f50924ehkuang  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
223691037db265ecdd914a26e056cf69207b4f50924ehkuang  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
223791037db265ecdd914a26e056cf69207b4f50924ehkuang  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
223891037db265ecdd914a26e056cf69207b4f50924ehkuang
223991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
224091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
224191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
224291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
224391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
224491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
224591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
224691037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
224791037db265ecdd914a26e056cf69207b4f50924ehkuang
224891037db265ecdd914a26e056cf69207b4f50924ehkuang  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
224991037db265ecdd914a26e056cf69207b4f50924ehkuang  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
225091037db265ecdd914a26e056cf69207b4f50924ehkuang  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
225191037db265ecdd914a26e056cf69207b4f50924ehkuang  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
225291037db265ecdd914a26e056cf69207b4f50924ehkuang  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
225391037db265ecdd914a26e056cf69207b4f50924ehkuang  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
225491037db265ecdd914a26e056cf69207b4f50924ehkuang  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
225591037db265ecdd914a26e056cf69207b4f50924ehkuang  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
225691037db265ecdd914a26e056cf69207b4f50924ehkuang
225791037db265ecdd914a26e056cf69207b4f50924ehkuang  s[10] = _mm_packs_epi32(u[0], u[1]);
225891037db265ecdd914a26e056cf69207b4f50924ehkuang  s[13] = _mm_packs_epi32(u[2], u[3]);
225991037db265ecdd914a26e056cf69207b4f50924ehkuang  s[11] = _mm_packs_epi32(u[4], u[5]);
226091037db265ecdd914a26e056cf69207b4f50924ehkuang  s[12] = _mm_packs_epi32(u[6], u[7]);
226191037db265ecdd914a26e056cf69207b4f50924ehkuang  s[14] = t[14];
226291037db265ecdd914a26e056cf69207b4f50924ehkuang  s[15] = t[15];
226391037db265ecdd914a26e056cf69207b4f50924ehkuang
226491037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 7
226591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[0] = _mm_add_epi16(s[0], s[15]);
226691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[1] = _mm_add_epi16(s[1], s[14]);
226791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[2] = _mm_add_epi16(s[2], s[13]);
226891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[3] = _mm_add_epi16(s[3], s[12]);
226991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[4] = _mm_add_epi16(s[4], s[11]);
227091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[5] = _mm_add_epi16(s[5], s[10]);
227191037db265ecdd914a26e056cf69207b4f50924ehkuang  in[6] = _mm_add_epi16(s[6], s[9]);
227291037db265ecdd914a26e056cf69207b4f50924ehkuang  in[7] = _mm_add_epi16(s[7], s[8]);
227391037db265ecdd914a26e056cf69207b4f50924ehkuang  in[8] = _mm_sub_epi16(s[7], s[8]);
227491037db265ecdd914a26e056cf69207b4f50924ehkuang  in[9] = _mm_sub_epi16(s[6], s[9]);
227591037db265ecdd914a26e056cf69207b4f50924ehkuang  in[10] = _mm_sub_epi16(s[5], s[10]);
227691037db265ecdd914a26e056cf69207b4f50924ehkuang  in[11] = _mm_sub_epi16(s[4], s[11]);
227791037db265ecdd914a26e056cf69207b4f50924ehkuang  in[12] = _mm_sub_epi16(s[3], s[12]);
227891037db265ecdd914a26e056cf69207b4f50924ehkuang  in[13] = _mm_sub_epi16(s[2], s[13]);
227991037db265ecdd914a26e056cf69207b4f50924ehkuang  in[14] = _mm_sub_epi16(s[1], s[14]);
228091037db265ecdd914a26e056cf69207b4f50924ehkuang  in[15] = _mm_sub_epi16(s[0], s[15]);
228191037db265ecdd914a26e056cf69207b4f50924ehkuang}
228291037db265ecdd914a26e056cf69207b4f50924ehkuang
// One pass of the 16-point inverse DCT over a block split across two vector
// arrays (the caller in this file passes two arrays of 16 __m128i each).
//
// The block is first transposed with array_transpose_16x16(), then
// idct16_1d_8col() is run on each half. Both helpers operate through the
// pointers they are given, so the result replaces the contents of in0/in1.
void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_1d_8col(in0);
  idct16_1d_8col(in1);
}
228891037db265ecdd914a26e056cf69207b4f50924ehkuang
// One pass of the 16-point inverse ADST over a block split across two vector
// arrays (the caller in this file passes two arrays of 16 __m128i each).
//
// Mirrors idct16_1d_sse2(): transpose with array_transpose_16x16(), then run
// iadst16_1d_8col() on each half. The result replaces the contents of
// in0/in1.
void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_1d_8col(in0);
  iadst16_1d_8col(in1);
}
229491037db265ecdd914a26e056cf69207b4f50924ehkuang
// Loads an 8-wide, 16-tall strip of 16-bit coefficients into in[0..15].
//
// input: first coefficient of the strip. Consecutive vectors are read 16
//        int16_t elements apart, so in[i] receives input[i * 16 .. i * 16 + 7].
//        The buffer is only read, never written.
// in:    receives the 16 loaded vectors.
//
// _mm_load_si128 is the aligned load, so every address input + i * 16 must be
// 16-byte aligned.
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
  int i;

  for (i = 0; i < 16; ++i) {
    in[i] = _mm_load_si128((const __m128i *)(input + i * 16));
  }
}
231491037db265ecdd914a26e056cf69207b4f50924ehkuang
// Rounds the 16 vectors in in[0..15] and hands each one, in order, to
// RECON_AND_STORE together with dest.
//
// dest:   destination pointer passed to RECON_AND_STORE.
// in:     16 vectors of 16-bit values; overwritten with the rounded values
//         (and with whatever RECON_AND_STORE leaves in them).
// stride: not referenced directly in this function; see the note below.
//
// NOTE(review): RECON_AND_STORE is a macro defined elsewhere in this file.
// `zero` and `stride` are not used by any statement written here, so they are
// presumably picked up by name inside the macro expansion, and the macro
// presumably advances `dest` between calls. Confirm against the macro
// definition before renaming or removing either of them.
static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();
  int i;

  // Final rounding and shift: saturating add of 32, then arithmetic shift
  // right by 6, on every 16-bit lane.
  for (i = 0; i < 16; ++i) {
    in[i] = _mm_adds_epi16(in[i], final_rounding);
    in[i] = _mm_srai_epi16(in[i], 6);
  }

  // Kept as a separate pass so every vector is rounded before the first
  // store, matching the original statement order.
  for (i = 0; i < 16; ++i) {
    RECON_AND_STORE(dest, in[i]);
  }
}
237091037db265ecdd914a26e056cf69207b4f50924ehkuang
// 16x16 inverse hybrid transform: loads the coefficients, applies two 1-D
// passes selected by tx_type, and passes the result to write_buffer_8x16().
//
// input:   16x16 block of 16-bit coefficients, 16 elements per row. It is read
//          with aligned loads (see load_buffer_8x16), so it must be 16-byte
//          aligned.
// dest:    destination pointer forwarded to write_buffer_8x16(); the second
//          half is written starting 8 bytes further on.
// stride:  forwarded unchanged to write_buffer_8x16().
// tx_type: 0..3, selecting which 1-D transform runs in each pass (see the
//          case labels). Any other value trips assert(0); when assertions are
//          compiled out, no transform is applied and the loaded coefficients
//          go straight to the write stage.
void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride,
                                 int tx_type) {
  __m128i in0[16], in1[16];

  // in0 takes elements 0..7 of every row, in1 takes elements 8..15.
  load_buffer_8x16(input, in0);
  load_buffer_8x16(input + 8, in1);

  // Each *_1d_sse2 call transposes the block before transforming it, so two
  // consecutive calls cover both dimensions.
  switch (tx_type) {
    case 0:  // DCT_DCT
      idct16_1d_sse2(in0, in1);
      idct16_1d_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      idct16_1d_sse2(in0, in1);
      iadst16_1d_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      iadst16_1d_sse2(in0, in1);
      idct16_1d_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      iadst16_1d_sse2(in0, in1);
      iadst16_1d_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }

  write_buffer_8x16(dest, in0, stride);
  write_buffer_8x16(dest + 8, in1, stride);
}
240591037db265ecdd914a26e056cf69207b4f50924ehkuang
2406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
2407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                                     int stride) {
2408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
2410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i zero = _mm_setzero_si128();
2411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
2415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
2416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
2417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
2418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
2424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
2425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
2429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
2430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
2438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
2439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
2440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in14 = zero, in15 = zero;
2441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
2442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
2443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
2444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
2446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8_0, stp1_12_0;
2448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
2450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
2452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 1-D idct. Load input data.
2453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in0 = _mm_load_si128((__m128i *)input);
2454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
2455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
2456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
2457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
2458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
2459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
2460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
2461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
2463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
2464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage2
2466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
2468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
2469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
2470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);
2471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
2475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
2476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
2477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
2478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding);
2486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding);
2487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_add_epi32(tmp5, rounding);
2488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_add_epi32(tmp7, rounding);
2489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_8 = _mm_packs_epi32(tmp0, zero);
2500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_15 = _mm_packs_epi32(tmp2, zero);
2501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_9 = _mm_packs_epi32(tmp4, zero);
2502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_14 = _mm_packs_epi32(tmp6, zero);
2503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_10 = _mm_packs_epi32(tmp1, zero);
2505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_13 = _mm_packs_epi32(tmp3, zero);
2506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_11 = _mm_packs_epi32(tmp5, zero);
2507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_12 = _mm_packs_epi32(tmp7, zero);
2508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage3
2511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
2513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);
2514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
2518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);
2519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_4 = _mm_packs_epi32(tmp0, zero);
2531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_7 = _mm_packs_epi32(tmp2, zero);
2532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp4, zero);
2533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp6, zero);
2534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
2536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
2537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
2538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
2539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
2541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
2542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
2543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
2544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage4
2547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
2549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
2550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
2551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
2556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
2557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding);
2567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding);
2568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_add_epi32(tmp5, rounding);
2569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_add_epi32(tmp7, rounding);
2570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2572ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_0 = _mm_packs_epi32(tmp0, zero);
2581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_1 = _mm_packs_epi32(tmp2, zero);
2582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_2 = _mm_packs_epi32(tmp4, zero);
2583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_3 = _mm_packs_epi32(tmp6, zero);
2584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_9 = _mm_packs_epi32(tmp1, zero);
2585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_14 = _mm_packs_epi32(tmp3, zero);
2586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_10 = _mm_packs_epi32(tmp5, zero);
2587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_13 = _mm_packs_epi32(tmp7, zero);
2588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
2590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
2591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
2592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
2593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage5 and Stage6
2596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
2598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
2599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
2600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
2601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
2603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
2604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
2605ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
2606ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
2608ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
2609ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
2610ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
2611ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2612ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2613ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage6
2614ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
2615ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
2616ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2617ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2618ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2619ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2620ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2621ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2622ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2623ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2624ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2625ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2626ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_add_epi32(tmp1, rounding);
2627ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_add_epi32(tmp3, rounding);
2628ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_add_epi32(tmp0, rounding);
2629ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_add_epi32(tmp2, rounding);
2630ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_add_epi32(tmp4, rounding);
2631ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_add_epi32(tmp6, rounding);
2632ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2633ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2634ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2635ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2636ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2637ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2638ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2639ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2640ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_5 = _mm_packs_epi32(tmp1, zero);
2641ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp1_6 = _mm_packs_epi32(tmp3, zero);
2642ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_10 = _mm_packs_epi32(tmp0, zero);
2643ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_13 = _mm_packs_epi32(tmp2, zero);
2644ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_11 = _mm_packs_epi32(tmp4, zero);
2645ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_12 = _mm_packs_epi32(tmp6, zero);
2646ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2647ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
2648ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
2649ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
2650ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
2651ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
2652ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
2653ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
2654ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
2655ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2656ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2657ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage7. Left 8x16 only.
2658ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l0 = _mm_add_epi16(stp2_0, stp1_15);
2659ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l1 = _mm_add_epi16(stp2_1, stp1_14);
2660ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l2 = _mm_add_epi16(stp2_2, stp2_13);
2661ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l3 = _mm_add_epi16(stp2_3, stp2_12);
2662ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l4 = _mm_add_epi16(stp2_4, stp2_11);
2663ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l5 = _mm_add_epi16(stp2_5, stp2_10);
2664ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l6 = _mm_add_epi16(stp2_6, stp1_9);
2665ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l7 = _mm_add_epi16(stp2_7, stp1_8);
2666ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l8 = _mm_sub_epi16(stp2_7, stp1_8);
2667ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l9 = _mm_sub_epi16(stp2_6, stp1_9);
2668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l10 = _mm_sub_epi16(stp2_5, stp2_10);
2669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l11 = _mm_sub_epi16(stp2_4, stp2_11);
2670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l12 = _mm_sub_epi16(stp2_3, stp2_12);
2671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l13 = _mm_sub_epi16(stp2_2, stp2_13);
2672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l14 = _mm_sub_epi16(stp2_1, stp1_14);
2673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  l15 = _mm_sub_epi16(stp2_0, stp1_15);
2674ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 2-D idct. We do 2 8x16 blocks.
2676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 2; i++) {
2677ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 0)
2678ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
2679ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in5, in6, in7);
2680ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2681ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i == 1)
2682ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
2683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
2684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
2686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    IDCT16x16_1D
2688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage7
2690ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in0 = _mm_add_epi16(stp2_0, stp1_15);
2691ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in1 = _mm_add_epi16(stp2_1, stp1_14);
2692ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in2 = _mm_add_epi16(stp2_2, stp2_13);
2693ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in3 = _mm_add_epi16(stp2_3, stp2_12);
2694ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in4 = _mm_add_epi16(stp2_4, stp2_11);
2695ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in5 = _mm_add_epi16(stp2_5, stp2_10);
2696ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in6 = _mm_add_epi16(stp2_6, stp1_9);
2697ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in7 = _mm_add_epi16(stp2_7, stp1_8);
2698ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = _mm_sub_epi16(stp2_7, stp1_8);
2699ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in9 = _mm_sub_epi16(stp2_6, stp1_9);
2700ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in10 = _mm_sub_epi16(stp2_5, stp2_10);
2701ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in11 = _mm_sub_epi16(stp2_4, stp2_11);
2702ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in12 = _mm_sub_epi16(stp2_3, stp2_12);
2703ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in13 = _mm_sub_epi16(stp2_2, stp2_13);
2704ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in14 = _mm_sub_epi16(stp2_1, stp1_14);
2705ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in15 = _mm_sub_epi16(stp2_0, stp1_15);
2706ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2707ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Final rounding and shift
2708ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in0 = _mm_adds_epi16(in0, final_rounding);
2709ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in1 = _mm_adds_epi16(in1, final_rounding);
2710ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in2 = _mm_adds_epi16(in2, final_rounding);
2711ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in3 = _mm_adds_epi16(in3, final_rounding);
2712ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in4 = _mm_adds_epi16(in4, final_rounding);
2713ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in5 = _mm_adds_epi16(in5, final_rounding);
2714ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in6 = _mm_adds_epi16(in6, final_rounding);
2715ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in7 = _mm_adds_epi16(in7, final_rounding);
2716ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = _mm_adds_epi16(in8, final_rounding);
2717ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in9 = _mm_adds_epi16(in9, final_rounding);
2718ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in10 = _mm_adds_epi16(in10, final_rounding);
2719ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in11 = _mm_adds_epi16(in11, final_rounding);
2720ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in12 = _mm_adds_epi16(in12, final_rounding);
2721ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in13 = _mm_adds_epi16(in13, final_rounding);
2722ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in14 = _mm_adds_epi16(in14, final_rounding);
2723ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in15 = _mm_adds_epi16(in15, final_rounding);
2724ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2725ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in0 = _mm_srai_epi16(in0, 6);
2726ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in1 = _mm_srai_epi16(in1, 6);
2727ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in2 = _mm_srai_epi16(in2, 6);
2728ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in3 = _mm_srai_epi16(in3, 6);
2729ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in4 = _mm_srai_epi16(in4, 6);
2730ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in5 = _mm_srai_epi16(in5, 6);
2731ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in6 = _mm_srai_epi16(in6, 6);
2732ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in7 = _mm_srai_epi16(in7, 6);
2733ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in8 = _mm_srai_epi16(in8, 6);
2734ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in9 = _mm_srai_epi16(in9, 6);
2735ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in10 = _mm_srai_epi16(in10, 6);
2736ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in11 = _mm_srai_epi16(in11, 6);
2737ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in12 = _mm_srai_epi16(in12, 6);
2738ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in13 = _mm_srai_epi16(in13, 6);
2739ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in14 = _mm_srai_epi16(in14, 6);
2740ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in15 = _mm_srai_epi16(in15, 6);
2741ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2742ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in0);
2743ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in1);
2744ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in2);
2745ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in3);
2746ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in4);
2747ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in5);
2748ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in6);
2749ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in7);
2750ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in8);
2751ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in9);
2752ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in10);
2753ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in11);
2754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in12);
2755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in13);
2756ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in14);
2757ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RECON_AND_STORE(dest, in15);
2758ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2759ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest += 8 - (stride * 16);
2760ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
2761ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
2762ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2763ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
2764ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2765ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i final_rounding = _mm_set1_epi16(1<<5);
2766ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2767ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // idct constants for each stage
2768ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
2769ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
2770ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
2771ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
2772ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
2773ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
2774ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
2775ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
2776ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
2777ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
2778ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
2779ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
2780ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
2781ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
2782ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
2783ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
2784ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2785ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2786ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2787ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
2788ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
2789ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
2790ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
2791ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2792ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2793ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2794ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2795ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2796ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
2797ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
2798ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
2799ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
2800ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
2801ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
2802ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
2803ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
2804ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2805ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2806ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2807ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
2808ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
2809ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2810ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2811ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2812ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2813ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2814ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2815ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
2816ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
2817ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          in24, in25, in26, in27, in28, in29, in30, in31;
2818ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i col[128];
2819ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
2820ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2821ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
2822ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
2823ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp1_30, stp1_31;
2824ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2825ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
2826ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
2827ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
2828ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          stp2_30, stp2_31;
2829ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2830ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i, j;
2831ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2832ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // We work on an 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
2833ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 8; i++) {
2834ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i < 4) {
2835ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // First 1-D idct
2836ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Load input data.
2837ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_load_si128((__m128i *)input);
2838ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
2839ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in16 = _mm_load_si128((__m128i *)(input + 8 * 2));
2840ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in24 = _mm_load_si128((__m128i *)(input + 8 * 3));
2841ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_load_si128((__m128i *)(input + 8 * 4));
2842ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_load_si128((__m128i *)(input + 8 * 5));
2843ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in17 = _mm_load_si128((__m128i *)(input + 8 * 6));
2844ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in25 = _mm_load_si128((__m128i *)(input + 8 * 7));
2845ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_load_si128((__m128i *)(input + 8 * 8));
2846ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_load_si128((__m128i *)(input + 8 * 9));
2847ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in18 = _mm_load_si128((__m128i *)(input + 8 * 10));
2848ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in26 = _mm_load_si128((__m128i *)(input + 8 * 11));
2849ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_load_si128((__m128i *)(input + 8 * 12));
2850ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_load_si128((__m128i *)(input + 8 * 13));
2851ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in19 = _mm_load_si128((__m128i *)(input + 8 * 14));
2852ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in27 = _mm_load_si128((__m128i *)(input + 8 * 15));
2853ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2854ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_load_si128((__m128i *)(input + 8 * 16));
2855ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_load_si128((__m128i *)(input + 8 * 17));
2856ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in20 = _mm_load_si128((__m128i *)(input + 8 * 18));
2857ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in28 = _mm_load_si128((__m128i *)(input + 8 * 19));
2858ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_load_si128((__m128i *)(input + 8 * 20));
2859ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_load_si128((__m128i *)(input + 8 * 21));
2860ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in21 = _mm_load_si128((__m128i *)(input + 8 * 22));
2861ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in29 = _mm_load_si128((__m128i *)(input + 8 * 23));
2862ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_load_si128((__m128i *)(input + 8 * 24));
2863ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_load_si128((__m128i *)(input + 8 * 25));
2864ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in22 = _mm_load_si128((__m128i *)(input + 8 * 26));
2865ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in30 = _mm_load_si128((__m128i *)(input + 8 * 27));
2866ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_load_si128((__m128i *)(input + 8 * 28));
2867ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_load_si128((__m128i *)(input + 8 * 29));
2868ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in23 = _mm_load_si128((__m128i *)(input + 8 * 30));
2869ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in31 = _mm_load_si128((__m128i *)(input + 8 * 31));
2870ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2871ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      input += 256;
2872ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2873ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Transpose 32x8 block to 8x32 block
2874ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
2875ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in4, in5, in6, in7);
2876ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
2877ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in10, in11, in12, in13, in14, in15);
2878ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
2879ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in18, in19, in20, in21, in22, in23);
2880ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
2881ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in26, in27, in28, in29, in30, in31);
2882ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else {
2883ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Second 1-D idct
2884ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j = i - 4;
2885ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2886ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Transpose 32x8 block to 8x32 block
2887ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
2888ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
2889ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
2890ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in5, in6, in7);
2891ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j += 4;
2892ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
2893ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
2894ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
2895ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in11, in12, in13, in14, in15);
2896ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j += 4;
2897ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
2898ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
2899ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
2900ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in19, in20, in21, in22, in23);
2901ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      j += 4;
2902ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
2903ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
2904ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
2905ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                    in28, in29, in30, in31);
2906ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
2907ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2908ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage1
2909ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    {
2910ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);
2911ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);
2912ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);
2913ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);
2914ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2915ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);
2916ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);
2917ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7);
2918ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);
2919ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2920ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);
2921ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);
2922ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);
2923ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);
2924ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2925ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);
2926ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);
2927ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);
2928ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);
2929ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2930ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,
2931ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,
2932ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp1_17, stp1_30)
2933ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,
2934ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,
2935ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp1_19, stp1_28)
2936ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,
2937ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,
2938ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp1_21, stp1_26)
2939ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
2940ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,
2941ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp1_23, stp1_24)
2942ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
2943ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2944ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage2
2945ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    {
2946ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);
2947ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);
2948ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);
2949ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);
2950ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2951ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);
2952ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);
2953ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);
2954ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);
2955ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2956ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,
2957ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,
2958ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp2_14)
2959ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,
2960ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,
2961ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp2_11, stp2_12)
2962ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2963ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
2964ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
2965ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
2966ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
2967ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2968ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
2969ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
2970ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
2971ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
2972ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2973ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
2974ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
2975ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
2976ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
2977ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2978ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
2979ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
2980ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
2981ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
2982ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
2983ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2984ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage3
2985ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    {
2986ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);
2987ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);
2988ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);
2989ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);
2990ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2991ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
2992ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
2993ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
2994ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
2995ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2996ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
2997ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
2998ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
2999ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
3000ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3001ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,
3002ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,
3003ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp1_6)
3004ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3005ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
3006ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
3007ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
3008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
3009ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
3010ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
3011ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
3012ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
3013ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3014ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
3015ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,
3016ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp1_18, stp1_29)
3017ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
3018ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,
3019ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp1_22, stp1_25)
3020ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3021ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_16 = stp2_16;
3022ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_31 = stp2_31;
3023ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_19 = stp2_19;
3024ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_20 = stp2_20;
3025ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_23 = stp2_23;
3026ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_24 = stp2_24;
3027ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_27 = stp2_27;
3028ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_28 = stp2_28;
3029ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
3030ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3031ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage4
3032ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    {
3033ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16);
3034ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16);
3035ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24);
3036ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24);
3037ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3038ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
3039ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
3040ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
3041ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
3042ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3043ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,
3044ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg4_1, stg4_2, stg4_3, stp2_0, stp2_1,
3045ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp2_2, stp2_3)
3046ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3047ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
3048ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
3049ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
3050ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
3051ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3052ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,
3053ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg4_5, stg4_6, stg4_4, stp2_9, stp2_14,
3054ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp2_10, stp2_13)
3055ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3056ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_8 = stp1_8;
3057ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_15 = stp1_15;
3058ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_11 = stp1_11;
3059ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_12 = stp1_12;
3060ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3061ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
3062ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
3063ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
3064ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
3065ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
3066ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
3067ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
3068ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
3069ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3070ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
3071ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
3072ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
3073ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
3074ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
3075ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
3076ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
3077ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
3078ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
3079ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3080ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage5
3081ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    {
3082ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
3083ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
3084ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
3085ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
3086ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3087ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
3088ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
3089ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
3090ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
3091ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3092ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
3093ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
3094ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3095ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
3096ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
3097ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
3098ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
3099ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
3101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
3102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
3103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
3104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp0 = _mm_add_epi32(tmp0, rounding);
3106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp1 = _mm_add_epi32(tmp1, rounding);
3107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp2 = _mm_add_epi32(tmp2, rounding);
3108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp3 = _mm_add_epi32(tmp3, rounding);
3109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
3111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
3112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
3113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
3114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_5 = _mm_packs_epi32(tmp0, tmp1);
3116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_6 = _mm_packs_epi32(tmp2, tmp3);
3117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_4 = stp2_4;
3119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_7 = stp2_7;
3120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
3122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
3123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
3124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
3125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
3126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
3127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
3128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
3129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_16 = stp2_16;
3131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_17 = stp2_17;
3132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
3134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,
3135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp1_19, stp1_28)
3136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
3137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,
3138ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp1_21, stp1_26)
3139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_22 = stp2_22;
3141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_23 = stp2_23;
3142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_24 = stp2_24;
3143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_25 = stp2_25;
3144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_30 = stp2_30;
3145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_31 = stp2_31;
3146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
3147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage6
3149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    {
3150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
3151ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
3152ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
3153ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
3154ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
3156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
3157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
3158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
3159ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
3160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
3161ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
3162ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
3163ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3164ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_8 = stp1_8;
3165ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_9 = stp1_9;
3166ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_14 = stp1_14;
3167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_15 = stp1_15;
3168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12,
3170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg6_0, stg4_0, stg6_0, stg4_0, stp2_10,
3171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp2_13, stp2_11, stp2_12)
3172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
3174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
3175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
3176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
3177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
3178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
3179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
3180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
3181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
3183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
3184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
3185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
3186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
3187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
3188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
3189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
3190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
3191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage7
3193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    {
3194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
3195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
3196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
3197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
3198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
3200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
3201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
3202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
3203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
3205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
3206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
3207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
3208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
3209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
3210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
3211ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
3212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
3213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
3214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
3215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
3216ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
3217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
3218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
3219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
3220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3221ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_16 = stp2_16;
3222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_17 = stp2_17;
3223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_18 = stp2_18;
3224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_19 = stp2_19;
3225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
3227ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,
3228ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp1_21, stp1_26)
3229ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
3230ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,
3231ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang                             stp1_23, stp1_24)
3232ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3233ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_28 = stp2_28;
3234ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_29 = stp2_29;
3235ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_30 = stp2_30;
3236ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      stp1_31 = stp2_31;
3237ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
3238ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3239ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // final stage
3240ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    if (i < 4) {
3241ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 1_D: Store 32 intermediate results for each 8x32 block.
3242ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3243ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3247ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3248ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3249ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3250ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3251ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3254ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3255ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3256ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3257ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3258ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3259ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3260ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3261ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3262ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3263ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3264ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3265ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3267ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3268ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3269ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3270ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3271ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3272ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3273ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3274ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    } else {
3275ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      const __m128i zero = _mm_setzero_si128();
3276ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // 2_D: Calculate the results and store them to destination.
3278ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_add_epi16(stp1_0, stp1_31);
3279ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_add_epi16(stp1_1, stp1_30);
3280ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_add_epi16(stp1_2, stp1_29);
3281ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_add_epi16(stp1_3, stp1_28);
3282ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_add_epi16(stp1_4, stp1_27);
3283ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_add_epi16(stp1_5, stp1_26);
3284ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_add_epi16(stp1_6, stp1_25);
3285ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_add_epi16(stp1_7, stp1_24);
3286ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_add_epi16(stp1_8, stp1_23);
3287ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_add_epi16(stp1_9, stp1_22);
3288ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_add_epi16(stp1_10, stp1_21);
3289ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_add_epi16(stp1_11, stp1_20);
3290ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_add_epi16(stp1_12, stp1_19);
3291ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_add_epi16(stp1_13, stp1_18);
3292ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_add_epi16(stp1_14, stp1_17);
3293ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_add_epi16(stp1_15, stp1_16);
3294ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in16 = _mm_sub_epi16(stp1_15, stp1_16);
3295ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in17 = _mm_sub_epi16(stp1_14, stp1_17);
3296ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in18 = _mm_sub_epi16(stp1_13, stp1_18);
3297ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in19 = _mm_sub_epi16(stp1_12, stp1_19);
3298ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in20 = _mm_sub_epi16(stp1_11, stp1_20);
3299ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in21 = _mm_sub_epi16(stp1_10, stp1_21);
3300ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in22 = _mm_sub_epi16(stp1_9, stp1_22);
3301ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in23 = _mm_sub_epi16(stp1_8, stp1_23);
3302ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in24 = _mm_sub_epi16(stp1_7, stp1_24);
3303ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in25 = _mm_sub_epi16(stp1_6, stp1_25);
3304ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in26 = _mm_sub_epi16(stp1_5, stp1_26);
3305ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in27 = _mm_sub_epi16(stp1_4, stp1_27);
3306ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in28 = _mm_sub_epi16(stp1_3, stp1_28);
3307ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in29 = _mm_sub_epi16(stp1_2, stp1_29);
3308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in30 = _mm_sub_epi16(stp1_1, stp1_30);
3309ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in31 = _mm_sub_epi16(stp1_0, stp1_31);
3310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Final rounding and shift
3312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_adds_epi16(in0, final_rounding);
3313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_adds_epi16(in1, final_rounding);
3314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_adds_epi16(in2, final_rounding);
3315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_adds_epi16(in3, final_rounding);
3316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_adds_epi16(in4, final_rounding);
3317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_adds_epi16(in5, final_rounding);
3318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_adds_epi16(in6, final_rounding);
3319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_adds_epi16(in7, final_rounding);
3320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_adds_epi16(in8, final_rounding);
3321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_adds_epi16(in9, final_rounding);
3322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_adds_epi16(in10, final_rounding);
3323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_adds_epi16(in11, final_rounding);
3324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_adds_epi16(in12, final_rounding);
3325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_adds_epi16(in13, final_rounding);
3326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_adds_epi16(in14, final_rounding);
3327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_adds_epi16(in15, final_rounding);
3328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in16 = _mm_adds_epi16(in16, final_rounding);
3329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in17 = _mm_adds_epi16(in17, final_rounding);
3330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in18 = _mm_adds_epi16(in18, final_rounding);
3331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in19 = _mm_adds_epi16(in19, final_rounding);
3332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in20 = _mm_adds_epi16(in20, final_rounding);
3333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in21 = _mm_adds_epi16(in21, final_rounding);
3334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in22 = _mm_adds_epi16(in22, final_rounding);
3335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in23 = _mm_adds_epi16(in23, final_rounding);
3336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in24 = _mm_adds_epi16(in24, final_rounding);
3337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in25 = _mm_adds_epi16(in25, final_rounding);
3338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in26 = _mm_adds_epi16(in26, final_rounding);
3339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in27 = _mm_adds_epi16(in27, final_rounding);
3340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in28 = _mm_adds_epi16(in28, final_rounding);
3341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in29 = _mm_adds_epi16(in29, final_rounding);
3342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in30 = _mm_adds_epi16(in30, final_rounding);
3343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in31 = _mm_adds_epi16(in31, final_rounding);
3344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in0 = _mm_srai_epi16(in0, 6);
3346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in1 = _mm_srai_epi16(in1, 6);
3347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in2 = _mm_srai_epi16(in2, 6);
3348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in3 = _mm_srai_epi16(in3, 6);
3349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in4 = _mm_srai_epi16(in4, 6);
3350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in5 = _mm_srai_epi16(in5, 6);
3351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in6 = _mm_srai_epi16(in6, 6);
3352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in7 = _mm_srai_epi16(in7, 6);
3353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in8 = _mm_srai_epi16(in8, 6);
3354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in9 = _mm_srai_epi16(in9, 6);
3355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in10 = _mm_srai_epi16(in10, 6);
3356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in11 = _mm_srai_epi16(in11, 6);
3357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in12 = _mm_srai_epi16(in12, 6);
3358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in13 = _mm_srai_epi16(in13, 6);
3359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in14 = _mm_srai_epi16(in14, 6);
3360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in15 = _mm_srai_epi16(in15, 6);
3361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in16 = _mm_srai_epi16(in16, 6);
3362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in17 = _mm_srai_epi16(in17, 6);
3363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in18 = _mm_srai_epi16(in18, 6);
3364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in19 = _mm_srai_epi16(in19, 6);
3365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in20 = _mm_srai_epi16(in20, 6);
3366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in21 = _mm_srai_epi16(in21, 6);
3367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in22 = _mm_srai_epi16(in22, 6);
3368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in23 = _mm_srai_epi16(in23, 6);
3369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in24 = _mm_srai_epi16(in24, 6);
3370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in25 = _mm_srai_epi16(in25, 6);
3371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in26 = _mm_srai_epi16(in26, 6);
3372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in27 = _mm_srai_epi16(in27, 6);
3373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in28 = _mm_srai_epi16(in28, 6);
3374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in29 = _mm_srai_epi16(in29, 6);
3375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in30 = _mm_srai_epi16(in30, 6);
3376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in31 = _mm_srai_epi16(in31, 6);
3377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in0);
3379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in1);
3380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in2);
3381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in3);
3382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in4);
3383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in5);
3384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in6);
3385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in7);
3386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in8);
3387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in9);
3388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in10);
3389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in11);
3390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in12);
3391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in13);
3392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in14);
3393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in15);
3394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in16);
3395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in17);
3396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in18);
3397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in19);
3398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in20);
3399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in21);
3400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in22);
3401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in23);
3402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in24);
3403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in25);
3404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in26);
3405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in27);
3406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in28);
3407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in29);
3408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in30);
3409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      RECON_AND_STORE(dest, in31);
3410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      dest += 8 - (stride * 32);
3412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
3413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
3414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
3415