/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Adds the low four 16-bit lanes of |in_x| to the four pixels at |dest| and
// writes the result back, saturated to [0, 255].
//
// Requires a variable named |zero| (an all-zero __m128i) in the caller's
// scope. |dest| is evaluated twice, so it must not have side effects.
//
// The four pixels are moved through an int with memcpy() rather than by
// dereferencing |dest| as an int pointer: |dest| is a byte pointer with no
// alignment guarantee, so a type-punned access would be a misaligned and
// aliasing-violating read/write.
#define RECON_AND_STORE4X4(dest, in_x)                 \
  {                                                    \
    int recon_px_;                                     \
    __m128i d0;                                        \
    memcpy(&recon_px_, (dest), sizeof(recon_px_));     \
    d0 = _mm_cvtsi32_si128(recon_px_);                 \
    d0 = _mm_unpacklo_epi8(d0, zero);                  \
    d0 = _mm_add_epi16((in_x), d0);                    \
    d0 = _mm_packus_epi16(d0, d0);                     \
    recon_px_ = _mm_cvtsi128_si32(d0);                 \
    memcpy((dest), &recon_px_, sizeof(recon_px_));     \
  }
23da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
242263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
252263fc984bdc858ee931d3e35c87c404de923950Johann                             int stride) {
26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();
27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i eight = _mm_set1_epi16(8);
28da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i cst = _mm_setr_epi16(
29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i input0, input1, input2, input3;
34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Rows
362263fc984bdc858ee931d3e35c87c404de923950Johann  input0 = load_input_data(input);
372263fc984bdc858ee931d3e35c87c404de923950Johann  input2 = load_input_data(input + 8);
38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Construct i3, i1, i3, i1, i2, i0, i2, i0
40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_shufflelo_epi16(input0, 0xd8);
41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_shufflehi_epi16(input0, 0xd8);
42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_shufflelo_epi16(input2, 0xd8);
43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_shufflehi_epi16(input2, 0xd8);
44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_unpackhi_epi32(input0, input0);
46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_unpacklo_epi32(input0, input0);
47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_unpackhi_epi32(input2, input2);
48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_unpacklo_epi32(input2, input2);
49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage 1
51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_madd_epi16(input0, cst);
52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_madd_epi16(input1, cst);
53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_madd_epi16(input2, cst);
54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_madd_epi16(input3, cst);
55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_add_epi32(input0, rounding);
57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_add_epi32(input1, rounding);
58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_add_epi32(input2, rounding);
59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_add_epi32(input3, rounding);
60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage 2
67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_packs_epi32(input0, input1);
68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_packs_epi32(input2, input3);
69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Transpose
71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_unpacklo_epi16(input0, input1);
72da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_unpackhi_epi16(input0, input1);
73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_unpacklo_epi32(input2, input3);
74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_unpackhi_epi32(input2, input3);
75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Switch column2, column 3, and then, we got:
77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // input2: column1, column 0;  input3: column2, column 3.
78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_shuffle_epi32(input1, 0x4e);
79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_add_epi16(input0, input1);
80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_sub_epi16(input0, input1);
81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Columns
83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Construct i3, i1, i3, i1, i2, i0, i2, i0
84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_unpacklo_epi32(input2, input2);
85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_unpackhi_epi32(input2, input2);
86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_unpackhi_epi32(input3, input3);
87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_unpacklo_epi32(input3, input3);
88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage 1
90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_madd_epi16(input0, cst);
91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_madd_epi16(input1, cst);
92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_madd_epi16(input2, cst);
93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_madd_epi16(input3, cst);
94da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
95da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_add_epi32(input0, rounding);
96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_add_epi32(input1, rounding);
97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_add_epi32(input2, rounding);
98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_add_epi32(input3, rounding);
99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage 2
106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_packs_epi32(input0, input2);
107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_packs_epi32(input1, input3);
108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Transpose
110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_unpacklo_epi16(input0, input1);
111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_unpackhi_epi16(input0, input1);
112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input0 = _mm_unpacklo_epi32(input2, input3);
113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_unpackhi_epi32(input2, input3);
114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Switch column2, column 3, and then, we got:
116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // input2: column1, column 0;  input3: column2, column 3.
117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input1 = _mm_shuffle_epi32(input1, 0x4e);
118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_add_epi16(input0, input1);
119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_sub_epi16(input0, input1);
120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Final round and shift
122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_add_epi16(input2, eight);
123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_add_epi16(input3, eight);
124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input2 = _mm_srai_epi16(input2, 4);
126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  input3 = _mm_srai_epi16(input3, 4);
127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Reconstruction and Store
129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {
130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    d0 = _mm_unpacklo_epi32(d0,
133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    d2 = _mm_unpacklo_epi32(
135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    d0 = _mm_unpacklo_epi8(d0, zero);
137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    d2 = _mm_unpacklo_epi8(d2, zero);
138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    d0 = _mm_add_epi16(d0, input2);
139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    d2 = _mm_add_epi16(d2, input3);
140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    d0 = _mm_packus_epi16(d0, d2);
141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // store input0
142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    *(int *)dest = _mm_cvtsi128_si32(d0);
143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // store input1
144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    d0 = _mm_srli_si128(d0, 4);
145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // store input2
147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    d0 = _mm_srli_si128(d0, 4);
148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // store input3
150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    d0 = _mm_srli_si128(d0, 4);
151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1552263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
1562263fc984bdc858ee931d3e35c87c404de923950Johann                            int stride) {
157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i dc_value;
158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();
159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int a;
160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = dct_const_round_shift(input[0] * cospi_16_64);
162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = dct_const_round_shift(a * cospi_16_64);
163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = ROUND_POWER_OF_TWO(a, 4);
164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dc_value = _mm_set1_epi16(a);
166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic INLINE void transpose_4x4(__m128i *res) {
174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid idct4_sse2(__m128i *in) {
182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i u[8], v[8];
188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  transpose_4x4(in);
190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 1
191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_packs_epi32(v[0], v[1]);
209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_packs_epi32(v[3], v[2]);
210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 2
212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[0] = _mm_add_epi16(u[0], u[1]);
213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[1] = _mm_sub_epi16(u[0], u[1]);
214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid iadst4_sse2(__m128i *in) {
218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i kZero = _mm_set1_epi16(0);
224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i u[8], v[8], in7;
226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  transpose_4x4(in);
228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in7 = _mm_srli_si128(in[1], 8);
229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in7 = _mm_add_epi16(in7, in[0]);
230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in7 = _mm_sub_epi16(in7, in[1]);
231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_unpacklo_epi16(in7, kZero);
235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_unpackhi_epi16(in[0], kZero);
236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], v[1]);
245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(v[3], v[4]);
246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = v[2];
247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(u[0], u[1]);
248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_slli_epi32(v[5], 2);
249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_add_epi32(u[3], v[5]);
250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_sub_epi32(u[5], u[4]);
251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[0] = _mm_packs_epi32(u[0], u[1]);
263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[1] = _mm_packs_epi32(u[2], u[3]);
264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Transposes an 8x8 matrix of 16-bit values. in0..in7 are the eight rows;
// out0..out7 receive the eight columns. Every input is read into a temporary
// before any output is written, so an output may name the same variable as an
// input.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                      out2, out3, out4, out5, out6, out7)                 \
  {                                                                       \
    /* Interleave 16-bit lanes of adjacent rows. */                       \
    const __m128i t8_r01_lo = _mm_unpacklo_epi16(in0, in1);               \
    const __m128i t8_r01_hi = _mm_unpackhi_epi16(in0, in1);               \
    const __m128i t8_r23_lo = _mm_unpacklo_epi16(in2, in3);               \
    const __m128i t8_r23_hi = _mm_unpackhi_epi16(in2, in3);               \
    const __m128i t8_r45_lo = _mm_unpacklo_epi16(in4, in5);               \
    const __m128i t8_r45_hi = _mm_unpackhi_epi16(in4, in5);               \
    const __m128i t8_r67_lo = _mm_unpacklo_epi16(in6, in7);               \
    const __m128i t8_r67_hi = _mm_unpackhi_epi16(in6, in7);               \
                                                                          \
    /* Interleave 32-bit pairs: each result holds two columns of either */ \
    /* the top four rows or the bottom four rows. */                      \
    const __m128i t8_top_c01 = _mm_unpacklo_epi32(t8_r01_lo, t8_r23_lo);  \
    const __m128i t8_top_c23 = _mm_unpackhi_epi32(t8_r01_lo, t8_r23_lo);  \
    const __m128i t8_top_c45 = _mm_unpacklo_epi32(t8_r01_hi, t8_r23_hi);  \
    const __m128i t8_top_c67 = _mm_unpackhi_epi32(t8_r01_hi, t8_r23_hi);  \
    const __m128i t8_bot_c01 = _mm_unpacklo_epi32(t8_r45_lo, t8_r67_lo);  \
    const __m128i t8_bot_c23 = _mm_unpackhi_epi32(t8_r45_lo, t8_r67_lo);  \
    const __m128i t8_bot_c45 = _mm_unpacklo_epi32(t8_r45_hi, t8_r67_hi);  \
    const __m128i t8_bot_c67 = _mm_unpackhi_epi32(t8_r45_hi, t8_r67_hi);  \
                                                                          \
    /* Join the top and bottom halves of each column. */                  \
    out0 = _mm_unpacklo_epi64(t8_top_c01, t8_bot_c01);                    \
    out1 = _mm_unpackhi_epi64(t8_top_c01, t8_bot_c01);                    \
    out2 = _mm_unpacklo_epi64(t8_top_c23, t8_bot_c23);                    \
    out3 = _mm_unpackhi_epi64(t8_top_c23, t8_bot_c23);                    \
    out4 = _mm_unpacklo_epi64(t8_top_c45, t8_bot_c45);                    \
    out5 = _mm_unpackhi_epi64(t8_top_c45, t8_bot_c45);                    \
    out6 = _mm_unpacklo_epi64(t8_top_c67, t8_bot_c67);                    \
    out7 = _mm_unpackhi_epi64(t8_top_c67, t8_bot_c67);                    \
  }
296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Rearranges four vectors of eight 16-bit lanes into four output vectors.
// Writing tmp0..tmp3 as A, B, C, D, output k (k = 0..3) receives the lanes
//   A[4+k], B[4+k], B[k], A[k], C[k], D[k], D[4+k], C[4+k]
// from low to high. Every input is read into a temporary before any output is
// written, so an output may name the same variable as an input.
#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
  {                                                                      \
    /* 16-bit interleave; note the swapped operand order on two lines. */ \
    const __m128i t4_ab_hi = _mm_unpackhi_epi16(tmp0, tmp1);             \
    const __m128i t4_ba_lo = _mm_unpacklo_epi16(tmp1, tmp0);             \
    const __m128i t4_cd_lo = _mm_unpacklo_epi16(tmp2, tmp3);             \
    const __m128i t4_dc_hi = _mm_unpackhi_epi16(tmp3, tmp2);             \
                                                                         \
    /* 32-bit interleave: lanes for outputs 0/1 and for outputs 2/3. */  \
    const __m128i t4_ab_01 = _mm_unpacklo_epi32(t4_ab_hi, t4_ba_lo);     \
    const __m128i t4_ab_23 = _mm_unpackhi_epi32(t4_ab_hi, t4_ba_lo);     \
    const __m128i t4_cd_01 = _mm_unpacklo_epi32(t4_cd_lo, t4_dc_hi);     \
    const __m128i t4_cd_23 = _mm_unpackhi_epi32(t4_cd_lo, t4_dc_hi);     \
                                                                         \
    out0 = _mm_unpacklo_epi64(t4_ab_01, t4_cd_01);                       \
    out1 = _mm_unpackhi_epi64(t4_ab_01, t4_cd_01);                       \
    out2 = _mm_unpacklo_epi64(t4_ab_23, t4_cd_23);                       \
    out3 = _mm_unpackhi_epi64(t4_ab_23, t4_cd_23);                       \
  }
315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Transposes the 4x4 block held in the LOW four 16-bit lanes of in0..in3 and
// packs two transposed rows into each output. Listing 16-bit lanes from the
// least significant end:
//   out0 = { in0[0], in1[0], in2[0], in3[0], in0[1], in1[1], in2[1], in3[1] }
//   out1 = { in0[2], in1[2], in2[2], in3[2], in0[3], in1[3], in2[3], in3[3] }
// The upper four lanes of every input are ignored (only unpacklo_epi16 is
// applied to the inputs).
//
// Expands to a braced block. Both inputs pairs are read into the block-local
// constants tr0_0 / tr0_1 before out0 / out1 are assigned; arguments must not
// use those two names.
316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {                                            \
318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian// Define Macro for multiplying elements by constants and adding them together.
//
// (lo_0, hi_0) and (lo_1, hi_1) are the low/high halves of two interleaved
// 16-bit vector pairs. For each half, _mm_madd_epi16 multiplies corresponding
// 16-bit lanes by the constant vector and sums every adjacent pair of
// products into one 32-bit lane. `rounding` is then added, the sum is shifted
// right arithmetically by DCT_CONST_BITS, and the lo/hi results are packed
// back to 16-bit lanes with signed saturation:
//   res0 <- (lo_0, hi_0) x cst0        res1 <- (lo_0, hi_0) x cst1
//   res2 <- (lo_1, hi_1) x cst2        res3 <- (lo_1, hi_1) x cst3
//
// Not self-contained: it assigns tmp0..tmp7 and reads `rounding` and
// DCT_CONST_BITS, none of which are parameters, so all of them must be
// visible where the macro is expanded. tmp0..tmp7 are left holding the
// shifted 32-bit intermediates. Expands to a braced block.
325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {   \
328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp0 = _mm_madd_epi16(lo_0, cst0); \
329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp1 = _mm_madd_epi16(hi_0, cst0); \
330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp2 = _mm_madd_epi16(lo_0, cst1); \
331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp3 = _mm_madd_epi16(hi_0, cst1); \
332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp4 = _mm_madd_epi16(lo_1, cst2); \
333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp5 = _mm_madd_epi16(hi_1, cst2); \
334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp6 = _mm_madd_epi16(lo_1, cst3); \
335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp7 = _mm_madd_epi16(hi_1, cst3); \
336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      \
337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp0 = _mm_add_epi32(tmp0, rounding); \
338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp1 = _mm_add_epi32(tmp1, rounding); \
339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp2 = _mm_add_epi32(tmp2, rounding); \
340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp3 = _mm_add_epi32(tmp3, rounding); \
341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp4 = _mm_add_epi32(tmp4, rounding); \
342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp5 = _mm_add_epi32(tmp5, rounding); \
343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp6 = _mm_add_epi32(tmp6, rounding); \
344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp7 = _mm_add_epi32(tmp7, rounding); \
345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      \
346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      \
355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      res0 = _mm_packs_epi32(tmp0, tmp1); \
356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      res1 = _mm_packs_epi32(tmp2, tmp3); \
357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      res2 = _mm_packs_epi32(tmp4, tmp5); \
358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      res3 = _mm_packs_epi32(tmp6, tmp7); \
359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Two-result variant of MULTIPLICATION_AND_ADD for a single interleaved pair
// (lo_0, hi_0): each half is multiply-accumulated (_mm_madd_epi16) against
// cst0 and cst1, `rounding` is added, the sums are shifted right
// arithmetically by DCT_CONST_BITS, and lo/hi are packed to 16-bit lanes with
// signed saturation:
//   res0 <- (lo_0, hi_0) x cst0        res1 <- (lo_0, hi_0) x cst1
//
// Not self-contained: it assigns tmp0..tmp3 and reads `rounding` and
// DCT_CONST_BITS from the expansion site. Expands to a braced block.
361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {   \
363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp0 = _mm_madd_epi16(lo_0, cst0); \
364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp1 = _mm_madd_epi16(hi_0, cst0); \
365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp2 = _mm_madd_epi16(lo_0, cst1); \
366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp3 = _mm_madd_epi16(hi_0, cst1); \
367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      \
368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp0 = _mm_add_epi32(tmp0, rounding); \
369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp1 = _mm_add_epi32(tmp1, rounding); \
370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp2 = _mm_add_epi32(tmp2, rounding); \
371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp3 = _mm_add_epi32(tmp3, rounding); \
372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      \
373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      \
378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      res0 = _mm_packs_epi32(tmp0, tmp1); \
379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      res1 = _mm_packs_epi32(tmp2, tmp3); \
380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Four-stage one-dimensional 8-point inverse DCT. in0..in7 are eight vectors
// of 16-bit lanes; every lane position is transformed independently and the
// results are written to out0..out7.
//
// Not self-contained. The expansion site must provide:
//   - stg1_0..stg1_3 and stg2_0..stg2_3 : paired cosine constants
//   - stp1_0..stp1_7 and stp2_0..stp2_7 : assignable stage temporaries
//   - tmp0..tmp7, rounding, DCT_CONST_BITS : used by MULTIPLICATION_AND_ADD
//     and MULTIPLICATION_AND_ADD_2
//
// out0..out7 are only assigned in Stage 4, after every input has been read,
// so the outputs may be the same objects as the inputs. Additions and
// subtractions between stages use saturating 16-bit arithmetic. Expands to a
// braced block; arguments must not be named lo_* / hi_*.
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    /* Stage1: rotate the odd inputs (1,7) and (3,5) into stp1_4..stp1_7. */ \
    { \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
      \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, \
                             stg1_0, stg1_1, stg1_2, stg1_3, \
                             stp1_4, stp1_7, stp1_5, stp1_6) \
    } \
    \
    /* Stage2: rotate the even inputs (0,4) and (2,6) into stp2_0..stp2_3, */ \
    /* then butterfly the Stage1 results into stp2_4..stp2_7.              */ \
    { \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
      \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, \
                             stg2_0, stg2_1, stg2_2, stg2_3, \
                             stp2_0, stp2_1, stp2_2, stp2_3) \
      \
      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
    } \
    \
    /* Stage3: butterfly the even half into stp1_0..stp1_3 and rotate */ \
    /* (stp2_6, stp2_5) into stp1_5 / stp1_6.                         */ \
    { \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
      \
      /* Same madd / round / shift / pack sequence as Stages 1 and 2,  */ \
      /* expressed through the shared two-result helper instead of an  */ \
      /* inline copy: stg2_1 yields stp1_5 and stg2_0 yields stp1_6.   */ \
      MULTIPLICATION_AND_ADD_2(lo_56, hi_56, stg2_1, stg2_0, \
                               stp1_5, stp1_6) \
    } \
    \
    /* Stage4: final butterflies; the only place the outputs are written. */ \
    out0 = _mm_adds_epi16(stp1_0, stp2_7); \
    out1 = _mm_adds_epi16(stp1_1, stp1_6); \
    out2 = _mm_adds_epi16(stp1_2, stp1_5); \
    out3 = _mm_adds_epi16(stp1_3, stp2_4); \
    out4 = _mm_subs_epi16(stp1_3, stp2_4); \
    out5 = _mm_subs_epi16(stp1_2, stp1_5); \
    out6 = _mm_subs_epi16(stp1_1, stp1_6); \
    out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }
453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
4542263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
4552263fc984bdc858ee931d3e35c87c404de923950Johann                             int stride) {
456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();
457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int i;
473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Load input data.
4752263fc984bdc858ee931d3e35c87c404de923950Johann  in0 = load_input_data(input);
4762263fc984bdc858ee931d3e35c87c404de923950Johann  in1 = load_input_data(input + 8 * 1);
4772263fc984bdc858ee931d3e35c87c404de923950Johann  in2 = load_input_data(input + 8 * 2);
4782263fc984bdc858ee931d3e35c87c404de923950Johann  in3 = load_input_data(input + 8 * 3);
4792263fc984bdc858ee931d3e35c87c404de923950Johann  in4 = load_input_data(input + 8 * 4);
4802263fc984bdc858ee931d3e35c87c404de923950Johann  in5 = load_input_data(input + 8 * 5);
4812263fc984bdc858ee931d3e35c87c404de923950Johann  in6 = load_input_data(input + 8 * 6);
4822263fc984bdc858ee931d3e35c87c404de923950Johann  in7 = load_input_data(input + 8 * 7);
483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // 2-D
485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 2; i++) {
486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                  in0, in1, in2, in3, in4, in5, in6, in7);
489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // 4-stage 1D idct8x8
491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          in0, in1, in2, in3, in4, in5, in6, in7);
493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Final rounding and shift
496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in0 = _mm_adds_epi16(in0, final_rounding);
497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in1 = _mm_adds_epi16(in1, final_rounding);
498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in2 = _mm_adds_epi16(in2, final_rounding);
499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in3 = _mm_adds_epi16(in3, final_rounding);
500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in4 = _mm_adds_epi16(in4, final_rounding);
501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in5 = _mm_adds_epi16(in5, final_rounding);
502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in6 = _mm_adds_epi16(in6, final_rounding);
503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in7 = _mm_adds_epi16(in7, final_rounding);
504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in0 = _mm_srai_epi16(in0, 5);
506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in1 = _mm_srai_epi16(in1, 5);
507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in2 = _mm_srai_epi16(in2, 5);
508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in3 = _mm_srai_epi16(in3, 5);
509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in4 = _mm_srai_epi16(in4, 5);
510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in5 = _mm_srai_epi16(in5, 5);
511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in6 = _mm_srai_epi16(in6, 5);
512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in7 = _mm_srai_epi16(in7, 5);
513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 0 * stride, in0);
515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 1 * stride, in1);
516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 2 * stride, in2);
517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 3 * stride, in3);
518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 4 * stride, in4);
519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 5 * stride, in5);
520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 6 * stride, in6);
521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 7 * stride, in7);
522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
5242263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
5252263fc984bdc858ee931d3e35c87c404de923950Johann                            int stride) {
526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i dc_value;
527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();
528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int a;
529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = dct_const_round_shift(input[0] * cospi_16_64);
531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = dct_const_round_shift(a * cospi_16_64);
532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = ROUND_POWER_OF_TWO(a, 5);
533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dc_value = _mm_set1_epi16(a);
535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 0 * stride, dc_value);
537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 1 * stride, dc_value);
538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 2 * stride, dc_value);
539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 3 * stride, dc_value);
540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 4 * stride, dc_value);
541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 5 * stride, dc_value);
542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 6 * stride, dc_value);
543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 7 * stride, dc_value);
544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid idct8_sse2(__m128i *in) {
547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                in0, in1, in2, in3, in4, in5, in6, in7);
565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // 4-stage 1D idct8x8
567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid iadst8_sse2(__m128i *in) {
572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__const_0 = _mm_set1_epi16(0);
586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // transpose
595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  array_transpose_8x8(in, in);
596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // properly aligned for butterfly input
598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in0 = in[7];
599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in1 = in[0];
600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in2 = in[5];
601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in3 = in[2];
602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in4 = in[3];
603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in5 = in[4];
604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in6 = in[1];
605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in7 = in[6];
606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // column transformation
608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 1
609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // interleave and multiply/add into 32-bit integer
610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s0 = _mm_unpacklo_epi16(in0, in1);
611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s1 = _mm_unpackhi_epi16(in0, in1);
612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s2 = _mm_unpacklo_epi16(in2, in3);
613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s3 = _mm_unpackhi_epi16(in2, in3);
614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s4 = _mm_unpacklo_epi16(in4, in5);
615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s5 = _mm_unpackhi_epi16(in4, in5);
616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s6 = _mm_unpacklo_epi16(in6, in7);
617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s7 = _mm_unpackhi_epi16(in6, in7);
618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
634da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
635da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // addition
637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w0 = _mm_add_epi32(u0, u8);
638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w1 = _mm_add_epi32(u1, u9);
639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w2 = _mm_add_epi32(u2, u10);
640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w3 = _mm_add_epi32(u3, u11);
641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w4 = _mm_add_epi32(u4, u12);
642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w5 = _mm_add_epi32(u5, u13);
643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w6 = _mm_add_epi32(u6, u14);
644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w7 = _mm_add_epi32(u7, u15);
645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w8 = _mm_sub_epi32(u0, u8);
646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w9 = _mm_sub_epi32(u1, u9);
647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w10 = _mm_sub_epi32(u2, u10);
648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w11 = _mm_sub_epi32(u3, u11);
649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w12 = _mm_sub_epi32(u4, u12);
650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w13 = _mm_sub_epi32(u5, u13);
651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w14 = _mm_sub_epi32(u6, u14);
652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w15 = _mm_sub_epi32(u7, u15);
653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // shift and rounding
655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // back to 16-bit and pack 8 integers into __m128i
690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[0] = _mm_packs_epi32(u0, u1);
691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[1] = _mm_packs_epi32(u2, u3);
692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[2] = _mm_packs_epi32(u4, u5);
693da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[3] = _mm_packs_epi32(u6, u7);
694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[4] = _mm_packs_epi32(u8, u9);
695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[5] = _mm_packs_epi32(u10, u11);
696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[6] = _mm_packs_epi32(u12, u13);
697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[7] = _mm_packs_epi32(u14, u15);
698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 2
700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s0 = _mm_add_epi16(in[0], in[2]);
701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s1 = _mm_add_epi16(in[1], in[3]);
702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s2 = _mm_sub_epi16(in[0], in[2]);
703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s3 = _mm_sub_epi16(in[1], in[3]);
704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u0 = _mm_unpacklo_epi16(in[4], in[5]);
705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u1 = _mm_unpackhi_epi16(in[4], in[5]);
706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u2 = _mm_unpacklo_epi16(in[6], in[7]);
707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u3 = _mm_unpackhi_epi16(in[6], in[7]);
708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w0 = _mm_add_epi32(v0, v4);
719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w1 = _mm_add_epi32(v1, v5);
720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w2 = _mm_add_epi32(v2, v6);
721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w3 = _mm_add_epi32(v3, v7);
722da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w4 = _mm_sub_epi32(v0, v4);
723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w5 = _mm_sub_epi32(v1, v5);
724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w6 = _mm_sub_epi32(v2, v6);
725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  w7 = _mm_sub_epi32(v3, v7);
726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
742da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
743da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
744da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // back to 16-bit integers
746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s4 = _mm_packs_epi32(u0, u1);
747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s5 = _mm_packs_epi32(u2, u3);
748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s6 = _mm_packs_epi32(u4, u5);
749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s7 = _mm_packs_epi32(u6, u7);
750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 3
752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u0 = _mm_unpacklo_epi16(s2, s3);
753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u1 = _mm_unpackhi_epi16(s2, s3);
754da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u2 = _mm_unpacklo_epi16(s6, s7);
755da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u3 = _mm_unpackhi_epi16(s6, s7);
756da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
757da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
763da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
764da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
765da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
774da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
775da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
777da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
778da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
779da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
780da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
782da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
783da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s2 = _mm_packs_epi32(v0, v1);
785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s3 = _mm_packs_epi32(v2, v3);
786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s6 = _mm_packs_epi32(v4, v5);
787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s7 = _mm_packs_epi32(v6, v7);
788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[0] = s0;
790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[1] = _mm_sub_epi16(k__const_0, s4);
791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[2] = s6;
792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[3] = _mm_sub_epi16(k__const_0, s2);
793da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[4] = s3;
794da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[5] = _mm_sub_epi16(k__const_0, s7);
795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[6] = s5;
796da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[7] = _mm_sub_epi16(k__const_0, s1);
797da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
798da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
7992263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
8002263fc984bdc858ee931d3e35c87c404de923950Johann                             int stride) {
801da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();
802da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
803da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
804da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
805da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
806da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
807da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
808da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
809da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
810da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
811da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
812da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
813da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
814da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
815da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
816da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
817da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
818da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
819da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Rows. Load 4-row input data.
8202263fc984bdc858ee931d3e35c87c404de923950Johann  in0 = load_input_data(input);
8212263fc984bdc858ee931d3e35c87c404de923950Johann  in1 = load_input_data(input + 8 * 1);
8222263fc984bdc858ee931d3e35c87c404de923950Johann  in2 = load_input_data(input + 8 * 2);
8232263fc984bdc858ee931d3e35c87c404de923950Johann  in3 = load_input_data(input + 8 * 3);
824da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
825da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // 8x4 Transpose
826da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
827da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage1
828da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {
829da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
830da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
831da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
832da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
833da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
834da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
835da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
836da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
837da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_add_epi32(tmp0, rounding);
838da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_add_epi32(tmp2, rounding);
839da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp4 = _mm_add_epi32(tmp4, rounding);
840da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp6 = _mm_add_epi32(tmp6, rounding);
841da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
842da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
843da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
844da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
845da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
846da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
847da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
848da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
849da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
850da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage2
851da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {
852da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
853da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
854da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
855da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
856da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
857da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
858da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
859da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
860da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_add_epi32(tmp0, rounding);
861da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_add_epi32(tmp2, rounding);
862da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp4 = _mm_add_epi32(tmp4, rounding);
863da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp6 = _mm_add_epi32(tmp6, rounding);
864da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
865da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
866da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
867da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
868da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
869da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
870da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
871da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
872da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
873da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
874da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
875da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_4 = tmp0;
876da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
877da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
878da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
879da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
880da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage3
881da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {
882da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
883da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
884da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
885da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
886da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
887da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
888da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
889da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
890da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
891da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
892da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
893da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_add_epi32(tmp0, rounding);
894da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_add_epi32(tmp2, rounding);
895da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
896da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
897da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
898da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
899da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
900da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
901da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage4
902da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
903da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
904da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
905da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
906da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
907da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
908da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
909da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
910da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        in0, in1, in2, in3, in4, in5, in6, in7);
911da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Final rounding and shift
912da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in0 = _mm_adds_epi16(in0, final_rounding);
913da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in1 = _mm_adds_epi16(in1, final_rounding);
914da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in2 = _mm_adds_epi16(in2, final_rounding);
915da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in3 = _mm_adds_epi16(in3, final_rounding);
916da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in4 = _mm_adds_epi16(in4, final_rounding);
917da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in5 = _mm_adds_epi16(in5, final_rounding);
918da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in6 = _mm_adds_epi16(in6, final_rounding);
919da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in7 = _mm_adds_epi16(in7, final_rounding);
920da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
921da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in0 = _mm_srai_epi16(in0, 5);
922da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in1 = _mm_srai_epi16(in1, 5);
923da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in2 = _mm_srai_epi16(in2, 5);
924da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in3 = _mm_srai_epi16(in3, 5);
925da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in4 = _mm_srai_epi16(in4, 5);
926da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in5 = _mm_srai_epi16(in5, 5);
927da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in6 = _mm_srai_epi16(in6, 5);
928da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in7 = _mm_srai_epi16(in7, 5);
929da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
930da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 0 * stride, in0);
931da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 1 * stride, in1);
932da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 2 * stride, in2);
933da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 3 * stride, in3);
934da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 4 * stride, in4);
935da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 5 * stride, in5);
936da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 6 * stride, in6);
937da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  RECON_AND_STORE(dest + 7 * stride, in7);
938da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
939da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// IDCT16: stages 2 through 6 of the 16-point inverse DCT, written as a
// statement macro that operates on variables of the expansion site.
//
// Names that must be in scope where the macro is expanded:
//   in[0..15]          __m128i inputs (read only; all sixteen are used)
//   stg2_0..stg2_7, stg3_0..stg3_3, stg4_0..stg4_7, stg6_0
//                      __m128i multiplier constants
//   rounding           __m128i added to every 32-bit product in Stage 5
//   DCT_CONST_BITS     right-shift count applied after rounding
//   tmp0..tmp3         __m128i scratch
//   stp1_0..stp1_15, stp1_8_0, stp1_12_0, stp2_0..stp2_15
//                      __m128i intermediates, overwritten freely
//   MULTIPLICATION_AND_ADD
//                      helper macro defined elsewhere in this file
//
// Values left behind that no later stage of this macro consumes:
//   stp2_0..stp2_7, stp1_8, stp1_9, stp2_10..stp2_13, stp1_14, stp1_15
// NOTE(review): presumably the expansion site combines these in a final
// add/sub stage; that code is not part of this macro.
//
// All 16-bit additions and subtractions here use _mm_add_epi16 /
// _mm_sub_epi16, i.e. the wrapping (non-saturating) forms.
//
// The expansion is a sequence of brace-delimited blocks with no trailing
// semicolon, so it is not a single statement: do not use it as the
// unbraced body of an if/else/for.
#define IDCT16 \
  /* Stage 2: inputs 1/15, 9/7, 5/11 and 13/3 are interleaved and passed */ \
  /* through MULTIPLICATION_AND_ADD with stg2_*, giving stp2_8..stp2_15. */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
    \
  /* Stage 3: inputs 2/14 and 10/6 with stg3_* give stp1_4..stp1_7; */ \
  /* add/sub butterflies on the Stage 2 results give stp1_8_0,      */ \
  /* stp1_9..stp1_11, stp1_12_0 and stp1_13..stp1_15.               */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage 4: inputs 0/8 and 4/12 with stg4_0..stg4_3 give stp2_0..stp2_3; */ \
  /* butterflies on stp1_4..stp1_7 give stp2_4..stp2_7; the 9/14 and 10/13 */ \
  /* pairs with stg4_4..stg4_7 give stp2_9, stp2_14, stp2_10, stp2_13.     */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
    \
  /* Stage 5: butterflies on stp2_0..stp2_3 give stp1_0..stp1_3; the */ \
  /* stp2_6/stp2_5 pair is multiplied by stg4_1 and stg4_0 to give   */ \
  /* stp1_5 and stp1_6; butterflies finish stp1_8..stp1_15.          */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    /* 16x16 -> 32-bit multiply with adjacent products summed. */ \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    /* Add the rounding term, then arithmetic shift by DCT_CONST_BITS. */ \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    /* Narrow back to 16-bit lanes with signed saturation. */ \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    /* Order matters: stp1_8 must read stp1_11 before it is overwritten. */ \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    /* Order matters: stp1_12 must read stp1_15 before it is overwritten. */ \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
    \
  /* Stage 6: butterflies give stp2_0..stp2_7; the 10/13 and 11/12 pairs */ \
  /* with stg6_0/stg4_0 give stp2_10, stp2_13, stp2_11, stp2_12.         */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    /* stp2_7 and stp2_4 are read as inputs (Stage 4 values) by the sums */ \
    /* below and only overwritten by the matching differences afterwards. */ \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }
1068da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// IDCT16_10: reduced-input variant of IDCT16. Only in[0], in[1], in[2] and
// in[3] are read; wherever IDCT16 interleaves one of the other inputs, this
// macro interleaves `zero` instead.
// NOTE(review): the _10 suffix presumably refers to the 16x16 path used
// when only the first few coefficients are non-zero; confirm at the
// expansion sites.
//
// Names that must be in scope where the macro is expanded:
//   in[0..3]           __m128i inputs (read only)
//   zero               __m128i used in place of the inputs that are not read
//   stg2_0, stg2_1, stg2_6, stg2_7, stg3_0, stg3_1, stg4_0, stg4_1,
//   stg4_4..stg4_7, stg6_0
//                      __m128i multiplier constants
//   rounding, DCT_CONST_BITS, tmp0..tmp3
//                      rounding term, shift count and scratch for Stage 5
//   stp1_*, stp1_8_0, stp1_12_0, stp2_*
//                      __m128i intermediates, overwritten freely
//   MULTIPLICATION_AND_ADD, MULTIPLICATION_AND_ADD_2
//                      helper macros defined elsewhere in this file
//
// Values left behind that no later stage of this macro consumes (the same
// set as IDCT16):
//   stp2_0..stp2_7, stp1_8, stp1_9, stp2_10..stp2_13, stp1_14, stp1_15
//
// Local names such as lo_1_15 or lo_13_3 are kept from IDCT16 even though
// one half of each pair is `zero` here.
//
// The expansion is a sequence of brace-delimited blocks with no trailing
// semicolon, so it is not a single statement: do not use it as the
// unbraced body of an if/else/for.
#define IDCT16_10 \
    /* Stage 2: only the 1/15 and 13/3 pairs remain (in[15] and in[13] */ \
    /* replaced by zero). Results go straight into stp1_8_0, stp1_15,  */ \
    /* stp1_11 and stp1_12_0 instead of the stp2_8.. names of IDCT16.  */ \
    { \
      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
      \
      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
                             stg2_0, stg2_1, stg2_6, stg2_7, \
                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
    } \
      \
    /* Stage 3: the 2/14 pair (in[14] replaced by zero) gives stp2_4 and */ \
    /* stp2_7. Where IDCT16 has add/sub butterflies for stp1_9, stp1_10, */ \
    /* stp1_13 and stp1_14, this variant uses plain copies.              */ \
    { \
      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
      \
      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
                               stg3_0, stg3_1,  \
                               stp2_4, stp2_7) \
      \
      stp1_9  =  stp1_8_0; \
      stp1_10 =  stp1_11;  \
      \
      stp1_13 = stp1_12_0; \
      stp1_14 = stp1_15;   \
    } \
    \
    /* Stage 4: the 0/8 pair (in[8] replaced by zero) gives stp1_0 and  */ \
    /* stp1_1 directly; stp2_5 and stp2_6 are copies of stp2_4 and      */ \
    /* stp2_7; the 9/14 and 10/13 pairs are processed as in IDCT16.     */ \
    { \
      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
      \
      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
      \
      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
                               stg4_0, stg4_1, \
                               stp1_0, stp1_1) \
      stp2_5 = stp2_4; \
      stp2_6 = stp2_7; \
      \
      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                             stg4_4, stg4_5, stg4_6, stg4_7, \
                             stp2_9, stp2_14, stp2_10, stp2_13) \
    } \
      \
    /* Stage 5: stp1_2 and stp1_3 are copies of stp1_1 and stp1_0; the */ \
    /* rest (stp1_5, stp1_6, stp1_8..stp1_15) is computed as in IDCT16. */ \
    { \
      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_2 = stp1_1; \
      stp1_3 = stp1_0; \
      \
      /* 16x16 -> 32-bit multiply with adjacent products summed. */ \
      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
      \
      /* Add the rounding term, then arithmetic shift by DCT_CONST_BITS. */ \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      /* Narrow back to 16-bit lanes with signed saturation. */ \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
      \
      /* Order matters: stp1_8 must read stp1_11 before it is overwritten. */ \
      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
      \
      /* Order matters: stp1_12 must read stp1_15 before it is overwritten. */ \
      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
    } \
      \
    /* Stage 6: same statements as Stage 6 of IDCT16. */ \
    { \
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
      \
      /* stp2_7 and stp2_4 are read as inputs by the sums below and only */ \
      /* overwritten by the matching differences afterwards.             */ \
      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
      \
      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                             stg6_0, stg4_0, stg6_0, stg4_0, \
                             stp2_10, stp2_13, stp2_11, stp2_12) \
    }
1176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
11772263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
1178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                int stride) {
1179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
1181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();
1182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i in[16], l[16], r[16], *curr1;
1209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_8_0, stp1_12_0;
1212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int i;
1216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  curr1 = l;
1218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 2; i++) {
1219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // 1-D idct
1220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Load input data.
12222263fc984bdc858ee931d3e35c87c404de923950Johann    in[0] = load_input_data(input);
12232263fc984bdc858ee931d3e35c87c404de923950Johann    in[8] = load_input_data(input + 8 * 1);
12242263fc984bdc858ee931d3e35c87c404de923950Johann    in[1] = load_input_data(input + 8 * 2);
12252263fc984bdc858ee931d3e35c87c404de923950Johann    in[9] = load_input_data(input + 8 * 3);
12262263fc984bdc858ee931d3e35c87c404de923950Johann    in[2] = load_input_data(input + 8 * 4);
12272263fc984bdc858ee931d3e35c87c404de923950Johann    in[10] = load_input_data(input + 8 * 5);
12282263fc984bdc858ee931d3e35c87c404de923950Johann    in[3] = load_input_data(input + 8 * 6);
12292263fc984bdc858ee931d3e35c87c404de923950Johann    in[11] = load_input_data(input + 8 * 7);
12302263fc984bdc858ee931d3e35c87c404de923950Johann    in[4] = load_input_data(input + 8 * 8);
12312263fc984bdc858ee931d3e35c87c404de923950Johann    in[12] = load_input_data(input + 8 * 9);
12322263fc984bdc858ee931d3e35c87c404de923950Johann    in[5] = load_input_data(input + 8 * 10);
12332263fc984bdc858ee931d3e35c87c404de923950Johann    in[13] = load_input_data(input + 8 * 11);
12342263fc984bdc858ee931d3e35c87c404de923950Johann    in[6] = load_input_data(input + 8 * 12);
12352263fc984bdc858ee931d3e35c87c404de923950Johann    in[14] = load_input_data(input + 8 * 13);
12362263fc984bdc858ee931d3e35c87c404de923950Johann    in[7] = load_input_data(input + 8 * 14);
12372263fc984bdc858ee931d3e35c87c404de923950Johann    in[15] = load_input_data(input + 8 * 15);
1238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(in, in);
1240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(in + 8, in + 8);
1241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    IDCT16
1243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Stage7
1245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    curr1 = r;
1263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    input += 128;
1264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 2; i++) {
1266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    int j;
1267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // 1-D idct
1268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(l + i * 8, in);
1269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(r + i * 8, in + 8);
1270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    IDCT16
1272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // 2-D
1274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[0] = _mm_add_epi16(stp2_0, stp1_15);
1275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[1] = _mm_add_epi16(stp2_1, stp1_14);
1276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[2] = _mm_add_epi16(stp2_2, stp2_13);
1277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[3] = _mm_add_epi16(stp2_3, stp2_12);
1278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[4] = _mm_add_epi16(stp2_4, stp2_11);
1279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[5] = _mm_add_epi16(stp2_5, stp2_10);
1280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[6] = _mm_add_epi16(stp2_6, stp1_9);
1281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[7] = _mm_add_epi16(stp2_7, stp1_8);
1282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (j = 0; j < 16; ++j) {
1292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Final rounding and shift
1293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      in[j] = _mm_adds_epi16(in[j], final_rounding);
1294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      in[j] = _mm_srai_epi16(in[j], 6);
1295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      RECON_AND_STORE(dest + j * stride, in[j]);
1296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
1297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dest += 8;
1299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
13022263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
13032263fc984bdc858ee931d3e35c87c404de923950Johann                              int stride) {
1304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i dc_value;
1305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();
1306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int a, i;
1307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = dct_const_round_shift(input[0] * cospi_16_64);
1309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = dct_const_round_shift(a * cospi_16_64);
1310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = ROUND_POWER_OF_TWO(a, 6);
1311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dc_value = _mm_set1_epi16(a);
1313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 2; ++i) {
1315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest +  0 * stride, dc_value);
1316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest +  1 * stride, dc_value);
1317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest +  2 * stride, dc_value);
1318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest +  3 * stride, dc_value);
1319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest +  4 * stride, dc_value);
1320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest +  5 * stride, dc_value);
1321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest +  6 * stride, dc_value);
1322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest +  7 * stride, dc_value);
1323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest +  8 * stride, dc_value);
1324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest +  9 * stride, dc_value);
1325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest + 10 * stride, dc_value);
1326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest + 11 * stride, dc_value);
1327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest + 12 * stride, dc_value);
1328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest + 13 * stride, dc_value);
1329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest + 14 * stride, dc_value);
1330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    RECON_AND_STORE(dest + 15 * stride, dc_value);
1331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dest += 8;
1332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void iadst16_8col(__m128i *in) {
1336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // perform 16x16 1-D ADST for 8 columns
1337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i s[16], x[16], u[32], v[32];
1338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
1364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i kZero = _mm_set1_epi16(0);
1369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], v[16]);
1421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(v[1], v[17]);
1422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(v[2], v[18]);
1423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(v[3], v[19]);
1424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_add_epi32(v[4], v[20]);
1425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_add_epi32(v[5], v[21]);
1426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_add_epi32(v[6], v[22]);
1427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_add_epi32(v[7], v[23]);
1428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_add_epi32(v[8], v[24]);
1429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_add_epi32(v[9], v[25]);
1430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_add_epi32(v[10], v[26]);
1431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_add_epi32(v[11], v[27]);
1432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_add_epi32(v[12], v[28]);
1433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_add_epi32(v[13], v[29]);
1434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_add_epi32(v[14], v[30]);
1435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_add_epi32(v[15], v[31]);
1436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[16] = _mm_sub_epi32(v[0], v[16]);
1437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[17] = _mm_sub_epi32(v[1], v[17]);
1438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[18] = _mm_sub_epi32(v[2], v[18]);
1439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[19] = _mm_sub_epi32(v[3], v[19]);
1440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[20] = _mm_sub_epi32(v[4], v[20]);
1441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[21] = _mm_sub_epi32(v[5], v[21]);
1442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[22] = _mm_sub_epi32(v[6], v[22]);
1443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[23] = _mm_sub_epi32(v[7], v[23]);
1444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[24] = _mm_sub_epi32(v[8], v[24]);
1445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[25] = _mm_sub_epi32(v[9], v[25]);
1446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[26] = _mm_sub_epi32(v[10], v[26]);
1447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[27] = _mm_sub_epi32(v[11], v[27]);
1448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[28] = _mm_sub_epi32(v[12], v[28]);
1449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[29] = _mm_sub_epi32(v[13], v[29]);
1450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[30] = _mm_sub_epi32(v[14], v[30]);
1451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[31] = _mm_sub_epi32(v[15], v[31]);
1452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[0] = _mm_packs_epi32(u[0], u[1]);
1520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[1] = _mm_packs_epi32(u[2], u[3]);
1521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[2] = _mm_packs_epi32(u[4], u[5]);
1522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[3] = _mm_packs_epi32(u[6], u[7]);
1523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[4] = _mm_packs_epi32(u[8], u[9]);
1524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[5] = _mm_packs_epi32(u[10], u[11]);
1525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[6] = _mm_packs_epi32(u[12], u[13]);
1526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[7] = _mm_packs_epi32(u[14], u[15]);
1527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[8] = _mm_packs_epi32(u[16], u[17]);
1528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[9] = _mm_packs_epi32(u[18], u[19]);
1529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[10] = _mm_packs_epi32(u[20], u[21]);
1530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[11] = _mm_packs_epi32(u[22], u[23]);
1531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[12] = _mm_packs_epi32(u[24], u[25]);
1532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[13] = _mm_packs_epi32(u[26], u[27]);
1533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[14] = _mm_packs_epi32(u[28], u[29]);
1534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[15] = _mm_packs_epi32(u[30], u[31]);
1535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 2
1537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], v[8]);
1564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(v[1], v[9]);
1565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(v[2], v[10]);
1566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(v[3], v[11]);
1567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_add_epi32(v[4], v[12]);
1568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_add_epi32(v[5], v[13]);
1569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_add_epi32(v[6], v[14]);
1570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_add_epi32(v[7], v[15]);
1571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_sub_epi32(v[0], v[8]);
1572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_sub_epi32(v[1], v[9]);
1573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_sub_epi32(v[2], v[10]);
1574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_sub_epi32(v[3], v[11]);
1575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_sub_epi32(v[4], v[12]);
1576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_sub_epi32(v[5], v[13]);
1577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_sub_epi32(v[6], v[14]);
1578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_sub_epi32(v[7], v[15]);
1579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[0] = _mm_add_epi16(s[0], s[4]);
1615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[1] = _mm_add_epi16(s[1], s[5]);
1616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[2] = _mm_add_epi16(s[2], s[6]);
1617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[3] = _mm_add_epi16(s[3], s[7]);
1618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[4] = _mm_sub_epi16(s[0], s[4]);
1619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[5] = _mm_sub_epi16(s[1], s[5]);
1620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[6] = _mm_sub_epi16(s[2], s[6]);
1621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[7] = _mm_sub_epi16(s[3], s[7]);
1622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[8] = _mm_packs_epi32(u[0], u[1]);
1623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[9] = _mm_packs_epi32(u[2], u[3]);
1624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[10] = _mm_packs_epi32(u[4], u[5]);
1625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[11] = _mm_packs_epi32(u[6], u[7]);
1626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[12] = _mm_packs_epi32(u[8], u[9]);
1627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[13] = _mm_packs_epi32(u[10], u[11]);
1628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[14] = _mm_packs_epi32(u[12], u[13]);
1629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  x[15] = _mm_packs_epi32(u[14], u[15]);
1630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 3
1632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1634da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1635da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], v[4]);
1659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(v[1], v[5]);
1660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(v[2], v[6]);
1661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(v[3], v[7]);
1662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_sub_epi32(v[0], v[4]);
1663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_sub_epi32(v[1], v[5]);
1664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_sub_epi32(v[2], v[6]);
1665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_sub_epi32(v[3], v[7]);
1666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_add_epi32(v[8], v[12]);
1667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_add_epi32(v[9], v[13]);
1668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_add_epi32(v[10], v[14]);
1669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_add_epi32(v[11], v[15]);
1670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_sub_epi32(v[8], v[12]);
1671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_sub_epi32(v[9], v[13]);
1672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_sub_epi32(v[10], v[14]);
1673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_sub_epi32(v[11], v[15]);
1674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1693da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[0] = _mm_add_epi16(x[0], x[2]);
1710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[1] = _mm_add_epi16(x[1], x[3]);
1711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[2] = _mm_sub_epi16(x[0], x[2]);
1712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[3] = _mm_sub_epi16(x[1], x[3]);
1713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[4] = _mm_packs_epi32(v[0], v[1]);
1714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[5] = _mm_packs_epi32(v[2], v[3]);
1715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[6] = _mm_packs_epi32(v[4], v[5]);
1716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[7] = _mm_packs_epi32(v[6], v[7]);
1717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[8] = _mm_add_epi16(x[8], x[10]);
1718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[9] = _mm_add_epi16(x[9], x[11]);
1719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[10] = _mm_sub_epi16(x[8], x[10]);
1720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[11] = _mm_sub_epi16(x[9], x[11]);
1721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[12] = _mm_packs_epi32(v[8], v[9]);
1722da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[13] = _mm_packs_epi32(v[10], v[11]);
1723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[14] = _mm_packs_epi32(v[12], v[13]);
1724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[15] = _mm_packs_epi32(v[14], v[15]);
1725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 4
1727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1742da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1743da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1744da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1754da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1755da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1756da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1757da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1763da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1764da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1765da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1774da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1775da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1777da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1778da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1779da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1780da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1782da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1783da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[0] = s[0];
1788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[1] = _mm_sub_epi16(kZero, s[8]);
1789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[2] = s[12];
1790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[3] = _mm_sub_epi16(kZero, s[4]);
1791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[4] = _mm_packs_epi32(v[4], v[5]);
1792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[5] = _mm_packs_epi32(v[12], v[13]);
1793da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[6] = _mm_packs_epi32(v[8], v[9]);
1794da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[7] = _mm_packs_epi32(v[0], v[1]);
1795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[8] = _mm_packs_epi32(v[2], v[3]);
1796da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[9] = _mm_packs_epi32(v[10], v[11]);
1797da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[10] = _mm_packs_epi32(v[14], v[15]);
1798da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[11] = _mm_packs_epi32(v[6], v[7]);
1799da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[12] = s[5];
1800da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[13] = _mm_sub_epi16(kZero, s[13]);
1801da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[14] = s[9];
1802da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[15] = _mm_sub_epi16(kZero, s[1]);
1803da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1804da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1805da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct16_8col(__m128i *in) {
1806da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1807da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
1808da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1809da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
1810da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1811da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
1812da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1813da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
1814da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1815da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1816da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1817da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1818da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1819da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1820da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1821da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1822da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1823da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1824da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1825da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1826da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1827da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i v[16], u[16], s[16], t[16];
1828da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1829da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 1
1830da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[0] = in[0];
1831da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[1] = in[8];
1832da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[2] = in[4];
1833da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[3] = in[12];
1834da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[4] = in[2];
1835da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[5] = in[10];
1836da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[6] = in[6];
1837da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[7] = in[14];
1838da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[8] = in[1];
1839da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[9] = in[9];
1840da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[10] = in[5];
1841da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[11] = in[13];
1842da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[12] = in[3];
1843da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[13] = in[11];
1844da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[14] = in[7];
1845da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[15] = in[15];
1846da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1847da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 2
1848da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
1849da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
1850da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
1851da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
1852da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
1853da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
1854da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
1855da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
1856da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1857da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
1858da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
1859da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
1860da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
1861da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
1862da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
1863da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
1864da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
1865da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
1866da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
1867da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
1868da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
1869da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
1870da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
1871da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
1872da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
1873da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1874da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1875da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1876da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1877da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1878da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1879da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1880da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1881da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1882da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1883da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1884da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1885da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1886da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1887da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1888da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1889da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1890da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1891da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1892da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1893da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1894da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1895da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1896da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1897da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1898da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1899da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1900da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1901da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1902da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1903da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1904da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1905da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1906da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1907da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1908da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[8]  = _mm_packs_epi32(u[0], u[1]);
1909da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[15] = _mm_packs_epi32(u[2], u[3]);
1910da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[9]  = _mm_packs_epi32(u[4], u[5]);
1911da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[14] = _mm_packs_epi32(u[6], u[7]);
1912da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[10] = _mm_packs_epi32(u[8], u[9]);
1913da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[13] = _mm_packs_epi32(u[10], u[11]);
1914da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[11] = _mm_packs_epi32(u[12], u[13]);
1915da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[12] = _mm_packs_epi32(u[14], u[15]);
1916da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1917da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 3
1918da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[0] = s[0];
1919da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[1] = s[1];
1920da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[2] = s[2];
1921da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[3] = s[3];
1922da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
1923da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
1924da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
1925da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
1926da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1927da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1928da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1929da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1930da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1931da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1932da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1933da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1934da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1935da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1936da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1937da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1938da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1939da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1940da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1941da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1942da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1943da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1944da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1945da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1946da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1947da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1948da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1949da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1950da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1951da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1952da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1953da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1954da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[4] = _mm_packs_epi32(u[0], u[1]);
1955da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[7] = _mm_packs_epi32(u[2], u[3]);
1956da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[5] = _mm_packs_epi32(u[4], u[5]);
1957da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[6] = _mm_packs_epi32(u[6], u[7]);
1958da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[8] = _mm_add_epi16(s[8], s[9]);
1959da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[9] = _mm_sub_epi16(s[8], s[9]);
1960da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[10] = _mm_sub_epi16(s[11], s[10]);
1961da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[11] = _mm_add_epi16(s[10], s[11]);
1962da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[12] = _mm_add_epi16(s[12], s[13]);
1963da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[13] = _mm_sub_epi16(s[12], s[13]);
1964da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[14] = _mm_sub_epi16(s[15], s[14]);
1965da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[15] = _mm_add_epi16(s[14], s[15]);
1966da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1967da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 4
1968da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
1969da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
1970da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
1971da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
1972da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
1973da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
1974da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
1975da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
1976da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1977da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1978da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1979da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1980da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1981da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
1982da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
1983da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1984da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1985da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
1986da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
1987da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
1988da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
1989da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
1990da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
1991da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
1992da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
1993da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1994da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1995da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1996da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1997da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1998da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1999da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2000da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2001da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2002da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2003da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2004da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2005da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2006da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2007da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2008da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2009da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2010da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2011da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2012da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2013da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2014da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2015da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2016da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2017da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2018da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2019da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2020da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2021da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2022da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2023da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2024da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2025da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2026da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2027da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2028da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[0] = _mm_packs_epi32(u[0], u[1]);
2029da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[1] = _mm_packs_epi32(u[2], u[3]);
2030da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[2] = _mm_packs_epi32(u[4], u[5]);
2031da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[3] = _mm_packs_epi32(u[6], u[7]);
2032da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[4] = _mm_add_epi16(t[4], t[5]);
2033da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[5] = _mm_sub_epi16(t[4], t[5]);
2034da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[6] = _mm_sub_epi16(t[7], t[6]);
2035da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[7] = _mm_add_epi16(t[6], t[7]);
2036da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[8] = t[8];
2037da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[15] = t[15];
2038da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[9]  = _mm_packs_epi32(u[8], u[9]);
2039da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[14] = _mm_packs_epi32(u[10], u[11]);
2040da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[10] = _mm_packs_epi32(u[12], u[13]);
2041da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[13] = _mm_packs_epi32(u[14], u[15]);
2042da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[11] = t[11];
2043da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[12] = t[12];
2044da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2045da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 5
2046da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[0] = _mm_add_epi16(s[0], s[3]);
2047da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[1] = _mm_add_epi16(s[1], s[2]);
2048da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[2] = _mm_sub_epi16(s[1], s[2]);
2049da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[3] = _mm_sub_epi16(s[0], s[3]);
2050da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[4] = s[4];
2051da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[7] = s[7];
2052da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2053da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
2054da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
2055da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2056da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2057da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2058da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2059da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2060da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2061da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2062da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2063da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2064da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2065da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2066da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2067da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[5] = _mm_packs_epi32(u[0], u[1]);
2068da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[6] = _mm_packs_epi32(u[2], u[3]);
2069da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2070da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[8] = _mm_add_epi16(s[8], s[11]);
2071da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[9] = _mm_add_epi16(s[9], s[10]);
2072da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[10] = _mm_sub_epi16(s[9], s[10]);
2073da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[11] = _mm_sub_epi16(s[8], s[11]);
2074da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[12] = _mm_sub_epi16(s[15], s[12]);
2075da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[13] = _mm_sub_epi16(s[14], s[13]);
2076da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[14] = _mm_add_epi16(s[13], s[14]);
2077da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  t[15] = _mm_add_epi16(s[12], s[15]);
2078da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2079da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 6
2080da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[0] = _mm_add_epi16(t[0], t[7]);
2081da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[1] = _mm_add_epi16(t[1], t[6]);
2082da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[2] = _mm_add_epi16(t[2], t[5]);
2083da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[3] = _mm_add_epi16(t[3], t[4]);
2084da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[4] = _mm_sub_epi16(t[3], t[4]);
2085da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[5] = _mm_sub_epi16(t[2], t[5]);
2086da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[6] = _mm_sub_epi16(t[1], t[6]);
2087da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[7] = _mm_sub_epi16(t[0], t[7]);
2088da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[8] = t[8];
2089da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[9] = t[9];
2090da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2091da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
2092da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
2093da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
2094da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
2095da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2096da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2097da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2098da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2099da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[10] = _mm_packs_epi32(u[0], u[1]);
2124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[13] = _mm_packs_epi32(u[2], u[3]);
2125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[11] = _mm_packs_epi32(u[4], u[5]);
2126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[12] = _mm_packs_epi32(u[6], u[7]);
2127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[14] = t[14];
2128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  s[15] = t[15];
2129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // stage 7
2131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[0] = _mm_add_epi16(s[0], s[15]);
2132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[1] = _mm_add_epi16(s[1], s[14]);
2133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[2] = _mm_add_epi16(s[2], s[13]);
2134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[3] = _mm_add_epi16(s[3], s[12]);
2135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[4] = _mm_add_epi16(s[4], s[11]);
2136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[5] = _mm_add_epi16(s[5], s[10]);
2137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[6] = _mm_add_epi16(s[6], s[9]);
2138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[7] = _mm_add_epi16(s[7], s[8]);
2139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[8] = _mm_sub_epi16(s[7], s[8]);
2140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[9] = _mm_sub_epi16(s[6], s[9]);
2141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[10] = _mm_sub_epi16(s[5], s[10]);
2142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[11] = _mm_sub_epi16(s[4], s[11]);
2143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[12] = _mm_sub_epi16(s[3], s[12]);
2144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[13] = _mm_sub_epi16(s[2], s[13]);
2145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[14] = _mm_sub_epi16(s[1], s[14]);
2146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  in[15] = _mm_sub_epi16(s[0], s[15]);
2147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
2148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Runs array_transpose_16x16(in0, in1) and then applies idct16_8col() to in0
// and to in1, in that order. Results are written back through the two
// pointers; nothing is returned.
// NOTE(review): judging by the names this is one 1-D pass of the 16-point
// inverse DCT over a 16x16 block, with in0/in1 holding its two 8-column
// halves as 16-bit lanes -- confirm against the callers.
// NOTE(review): the stage-7 code directly above this function stores to
// in[0]..in[15]; if that routine is idct16_8col, each array must provide at
// least 16 __m128i entries -- confirm.
2149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid idct16_sse2(__m128i *in0, __m128i *in1) {
  // array_transpose_16x16 is defined outside this section; it receives both
  // arrays and is called before either column pass.
2150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  array_transpose_16x16(in0, in1);
  // The same column routine is applied to each array independently.
2151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  idct16_8col(in0);
2152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  idct16_8col(in1);
2153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
2154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Runs array_transpose_16x16(in0, in1) and then applies iadst16_8col() to in0
// and to in1, in that order. Results are written back through the two
// pointers; nothing is returned. The call sequence mirrors idct16_sse2, with
// iadst16_8col in place of idct16_8col.
// NOTE(review): judging by the names this is one 1-D pass of the 16-point
// inverse ADST over a 16x16 block split into two 8-column halves; neither
// iadst16_8col nor array_transpose_16x16 is visible here -- confirm the
// expected array length and data layout against their definitions.
2155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid iadst16_sse2(__m128i *in0, __m128i *in1) {
  // The transpose is called before either column pass.
2156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  array_transpose_16x16(in0, in1);
  // The same column routine is applied to each array independently.
2157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  iadst16_8col(in0);
2158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  iadst16_8col(in1);
2159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
2160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
21612263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
2162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               int stride) {
2163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();
2166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i in[16], l[16];
2184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
2185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_8_0, stp1_12_0;
2187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
2189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int i;
2191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // First 1-D inverse DCT
2192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Load input data.
21932263fc984bdc858ee931d3e35c87c404de923950Johann  in[0] = load_input_data(input);
21942263fc984bdc858ee931d3e35c87c404de923950Johann  in[1] = load_input_data(input + 8 * 2);
21952263fc984bdc858ee931d3e35c87c404de923950Johann  in[2] = load_input_data(input + 8 * 4);
21962263fc984bdc858ee931d3e35c87c404de923950Johann  in[3] = load_input_data(input + 8 * 6);
2197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
2199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage2
2201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {
2202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
2203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
2204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_add_epi32(tmp0, rounding);
2211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_add_epi32(tmp2, rounding);
2212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp5 = _mm_add_epi32(tmp5, rounding);
2213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp7 = _mm_add_epi32(tmp7, rounding);
2214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
2221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
2222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
2223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage3
2225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {
2226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
2227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_add_epi32(tmp0, rounding);
2232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_add_epi32(tmp2, rounding);
2233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
2241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage4
2243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {
2244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_add_epi32(tmp0, rounding);
2256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_add_epi32(tmp2, rounding);
2257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = _mm_add_epi32(tmp1, rounding);
2258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = _mm_add_epi32(tmp3, rounding);
2259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp5 = _mm_add_epi32(tmp5, rounding);
2260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp7 = _mm_add_epi32(tmp7, rounding);
2261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
2276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage5 and Stage6
2278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {
2279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
2285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
2287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
2294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage6
2296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  {
2297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = _mm_add_epi32(tmp1, rounding);
2309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = _mm_add_epi32(tmp3, rounding);
2310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_add_epi32(tmp0, rounding);
2311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_add_epi32(tmp2, rounding);
2312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp4 = _mm_add_epi32(tmp4, rounding);
2313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp6 = _mm_add_epi32(tmp6, rounding);
2314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_10 = _mm_packs_epi32(tmp0, zero);
2325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_13 = _mm_packs_epi32(tmp2, zero);
2326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_11 = _mm_packs_epi32(tmp4, zero);
2327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_12 = _mm_packs_epi32(tmp6, zero);
2328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
2343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Stage7. Left 8x16 only.
2345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[0] = _mm_add_epi16(stp2_0, stp1_15);
2346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[1] = _mm_add_epi16(stp2_1, stp1_14);
2347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[2] = _mm_add_epi16(stp2_2, stp2_13);
2348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[3] = _mm_add_epi16(stp2_3, stp2_12);
2349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[4] = _mm_add_epi16(stp2_4, stp2_11);
2350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[5] = _mm_add_epi16(stp2_5, stp2_10);
2351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[6] = _mm_add_epi16(stp2_6, stp1_9);
2352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[7] = _mm_add_epi16(stp2_7, stp1_8);
2353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Second 1-D inverse transform, performed per 8x16 block
2363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 2; i++) {
2364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    int j;
2365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_4X8(l + 8 * i, in);
2366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    IDCT16_10
2368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Stage7
2370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[0] = _mm_add_epi16(stp2_0, stp1_15);
2371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[1] = _mm_add_epi16(stp2_1, stp1_14);
2372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[2] = _mm_add_epi16(stp2_2, stp2_13);
2373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[3] = _mm_add_epi16(stp2_3, stp2_12);
2374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[4] = _mm_add_epi16(stp2_4, stp2_11);
2375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[5] = _mm_add_epi16(stp2_5, stp2_10);
2376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[6] = _mm_add_epi16(stp2_6, stp1_9);
2377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[7] = _mm_add_epi16(stp2_7, stp1_8);
2378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (j = 0; j < 16; ++j) {
2388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Final rounding and shift
2389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      in[j] = _mm_adds_epi16(in[j], final_rounding);
2390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      in[j] = _mm_srai_epi16(in[j], 6);
2391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      RECON_AND_STORE(dest + j * stride, in[j]);
2392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
2393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dest += 8;
2395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
2396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
2397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/*
 * LOAD_DQCOEFF(reg, input)
 *
 * Assigns load_input_data(input) to `reg`, then adds 8 to `input` so that
 * consecutive invocations step forward through the caller's buffer.
 *
 *   reg   - lvalue that receives the value returned by load_input_data().
 *   input - modifiable lvalue passed to load_input_data() and then
 *           incremented in place.  It is evaluated more than once, so the
 *           argument expression must not have side effects.
 *
 * The expansion is a plain brace block rather than do { } while (0), so an
 * invocation followed by ';' cannot be used as the body of an `if` that has
 * an `else`.
 *
 * The last line deliberately carries no trailing backslash: the definition
 * ends at the closing brace instead of splicing in the following line.
 */
#define LOAD_DQCOEFF(reg, input)    \
  {                                 \
    (reg) = load_input_data(input); \
    (input) += 8;                   \
  }
2403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define IDCT32_34 \
2405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage1 */ \
2406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();\
2408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
2412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg1_1, stp1_16, stp1_31); \
2422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg1_7, stp1_19, stp1_28); \
2424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg1_9, stp1_20, stp1_27); \
2426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg1_15, stp1_23, stp1_24); \
2428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage2 */ \
2431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();\
2433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg2_1, stp2_8, stp2_15); \
2441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg2_7, stp2_11, stp2_12); \
2443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_16 = stp1_16; \
2445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_19 = stp1_19; \
2446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_20 = stp1_20; \
2448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_23 = stp1_23; \
2449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_24 = stp1_24; \
2451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_27 = stp1_27; \
2452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_28 = stp1_28; \
2454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_31 = stp1_31; \
2455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage3 */ \
2458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();\
2460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
2472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg3_1, stp1_4, stp1_7); \
2475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_8 = stp2_8; \
2477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_11 = stp2_11; \
2478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_12 = stp2_12; \
2479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_15 = stp2_15; \
2480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_18, stp1_29) \
2484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_22, stp1_25) \
2487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_16 = stp2_16; \
2489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_31 = stp2_31; \
2490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_19 = stp2_19; \
2491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_20 = stp2_20; \
2492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_23 = stp2_23; \
2493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_24 = stp2_24; \
2494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_27 = stp2_27; \
2495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_28 = stp2_28; \
2496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage4 */ \
2499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();\
2501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_1, stp2_0, stp2_1); \
2511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_4 = stp1_4; \
2513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_5 = stp1_4; \
2514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_6 = stp1_7; \
2515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_7 = stp1_7; \
2516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp2_10, stp2_13) \
2520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_8 = stp1_8; \
2522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_15 = stp1_15; \
2523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_11 = stp1_11; \
2524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_12 = stp1_12; \
2525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage5 */ \
2546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_0 = stp2_0; \
2561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_1 = stp2_1; \
2562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_2 = stp2_1; \
2563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_3 = stp2_0; \
2564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp0 = _mm_add_epi32(tmp0, rounding); \
2571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp1 = _mm_add_epi32(tmp1, rounding); \
2572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp2 = _mm_add_epi32(tmp2, rounding); \
2573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp3 = _mm_add_epi32(tmp3, rounding); \
2574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_4 = stp2_4; \
2584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_7 = stp2_7; \
2585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_16 = stp2_16; \
2596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_17 = stp2_17; \
2597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_19, stp1_28) \
2601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_21, stp1_26) \
2604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_22 = stp2_22; \
2606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_23 = stp2_23; \
2607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_24 = stp2_24; \
2608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_25 = stp2_25; \
2609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_30 = stp2_30; \
2610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_31 = stp2_31; \
2611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage6 */ \
2614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_8 = stp1_8; \
2630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_9 = stp1_9; \
2631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_14 = stp1_14; \
2632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_15 = stp1_15; \
2633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2634da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2635da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp2_13, stp2_11, stp2_12) \
2637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage7 */ \
2658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_16 = stp2_16; \
2687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_17 = stp2_17; \
2688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_18 = stp2_18; \
2689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_19 = stp2_19; \
2690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
2693da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_21, stp1_26) \
2694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
2695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
2696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_23, stp1_24) \
2697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_28 = stp2_28; \
2699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_29 = stp2_29; \
2700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_30 = stp2_30; \
2701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_31 = stp2_31; \
2702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
2703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define IDCT32 \
2706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage1 */ \
2707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
2709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
2710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
2711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
2712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
2714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
2715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
2716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
2717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
2719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
2720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
2721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
2722da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
2724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
2725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
2726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
2727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
2729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
2730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_17, stp1_30) \
2731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
2732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
2733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_19, stp1_28) \
2734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
2735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
2736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_21, stp1_26) \
2737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
2738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
2739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_23, stp1_24) \
2740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2742da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage2 */ \
2743da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2744da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
2745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
2746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
2747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
2748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
2750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
2751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
2752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
2753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2754da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
2755da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
2756da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp2_14) \
2757da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
2758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
2759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp2_11, stp2_12) \
2760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
2762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
2763da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
2764da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
2765da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
2767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
2768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
2769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
2770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
2772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
2773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
2774da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
2775da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
2777da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
2778da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
2779da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
2780da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2782da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage3 */ \
2783da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
2785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
2786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
2787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
2788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
2790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
2791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2793da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2794da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2796da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2797da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2798da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2799da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
2800da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
2801da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_6) \
2802da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2803da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
2804da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
2805da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
2806da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
2807da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
2808da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
2809da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
2810da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
2811da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2812da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2813da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2814da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_18, stp1_29) \
2815da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2816da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2817da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_22, stp1_25) \
2818da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2819da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_16 = stp2_16; \
2820da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_31 = stp2_31; \
2821da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_19 = stp2_19; \
2822da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_20 = stp2_20; \
2823da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_23 = stp2_23; \
2824da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_24 = stp2_24; \
2825da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_27 = stp2_27; \
2826da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_28 = stp2_28; \
2827da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2828da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2829da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage4 */ \
2830da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2831da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
2832da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
2833da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
2834da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
2835da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2836da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
2837da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
2838da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2839da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2840da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2841da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
2842da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
2843da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp2_2, stp2_3) \
2844da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2845da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
2846da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
2847da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
2848da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
2849da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2850da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2851da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2852da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp2_10, stp2_13) \
2853da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2854da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_8 = stp1_8; \
2855da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_15 = stp1_15; \
2856da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_11 = stp1_11; \
2857da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_12 = stp1_12; \
2858da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2859da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2860da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2861da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2862da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2863da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2864da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2865da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2866da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2867da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2868da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2869da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2870da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2871da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2872da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2873da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2874da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2875da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2876da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2877da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2878da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage5 */ \
2879da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2880da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2881da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2882da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2883da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2884da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2885da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2886da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2887da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2888da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2889da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2890da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2891da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2892da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2893da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
2894da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
2895da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
2896da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
2897da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2898da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2899da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2900da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2901da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2902da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2903da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp0 = _mm_add_epi32(tmp0, rounding); \
2904da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp1 = _mm_add_epi32(tmp1, rounding); \
2905da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp2 = _mm_add_epi32(tmp2, rounding); \
2906da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp3 = _mm_add_epi32(tmp3, rounding); \
2907da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2908da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2909da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2910da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2911da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2912da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2913da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2914da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2915da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2916da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_4 = stp2_4; \
2917da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_7 = stp2_7; \
2918da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2919da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2920da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2921da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2922da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2923da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2924da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2925da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2926da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2927da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2928da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_16 = stp2_16; \
2929da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_17 = stp2_17; \
2930da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2931da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2932da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2933da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_19, stp1_28) \
2934da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2935da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2936da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_21, stp1_26) \
2937da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2938da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_22 = stp2_22; \
2939da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_23 = stp2_23; \
2940da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_24 = stp2_24; \
2941da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_25 = stp2_25; \
2942da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_30 = stp2_30; \
2943da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_31 = stp2_31; \
2944da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2945da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2946da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage6 */ \
2947da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2948da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2949da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2950da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2951da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2952da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2953da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2954da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2955da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2956da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2957da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2958da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2959da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2960da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2961da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2962da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_8 = stp1_8; \
2963da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_9 = stp1_9; \
2964da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_14 = stp1_14; \
2965da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_15 = stp1_15; \
2966da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2967da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2968da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2969da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp2_13, stp2_11, stp2_12) \
2970da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2971da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2972da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2973da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2974da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2975da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2976da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2977da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2978da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2979da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2980da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2981da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2982da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2983da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2984da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2985da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2986da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2987da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2988da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \
2989da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\
2990da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage7 */ \
2991da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \
2992da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2993da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2994da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2995da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2996da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
2997da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2998da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2999da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3000da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3001da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
3002da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3003da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3004da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3005da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3006da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3007da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3008da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3009da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3010da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3011da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3012da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3013da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3014da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3015da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3016da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3017da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3018da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
3019da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_16 = stp2_16; \
3020da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_17 = stp2_17; \
3021da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_18 = stp2_18; \
3022da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_19 = stp2_19; \
3023da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
3024da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3025da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3026da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_21, stp1_26) \
3027da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3028da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3029da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         stp1_23, stp1_24) \
3030da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  \
3031da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_28 = stp2_28; \
3032da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_29 = stp2_29; \
3033da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_30 = stp2_30; \
3034da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  stp1_31 = stp2_31; \
3035da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
3036da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3037da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian// Only upper-left 8x8 has non-zero coeff
// Two-pass 32x32 inverse transform for the sparse case where only the
// upper-left 8x8 coefficients are non-zero; the result is written to dest
// through RECON_AND_STORE.
//   input:  coefficient buffer. Eight rows are loaded, at element offsets
//           0, 32, ..., 224 (a row pitch of 32); nothing else is read.
//   dest:   destination pixel buffer, updated in place.
//   stride: distance in bytes between consecutive rows of dest.
// IDCT32_34 and RECON_AND_STORE are macros defined outside this function.
// They are expanded in place, so they presumably rely on the exact local
// names declared here (in, stp1_*, stp2_*, stg*, rounding, tmp*, zero) --
// confirm against their definitions before renaming anything.
30382263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
3039da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               int stride) {
  // rounding is not referenced directly in this body; presumably it is
  // consumed inside the IDCT32_34 expansion -- confirm.
3040da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Bias of 32 added before the final arithmetic shift right by 6 below.
3041da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i final_rounding = _mm_set1_epi16(1<<5);
3042da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3043da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // idct constants for each stage
  // None of the stg* constants is referenced directly in this body;
  // presumably they are read by the IDCT32_34 expansion -- confirm.
3044da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3045da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3046da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3047da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3048da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3049da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3050da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3051da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3052da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3053da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3054da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3055da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3056da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3057da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3058da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3059da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3060da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3061da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3062da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3063da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3064da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3065da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3066da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3067da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3068da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3069da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3070da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3071da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3072da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3073da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3074da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
  // in[]  : working vectors handed to IDCT32_34, later reused for the output.
  // col[] : the 32 first-pass result vectors, kept for the second pass.
3075da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i in[32], col[32];
  // stp1_0..stp1_31 are read after each IDCT32_34 expansion, so the macro
  // must leave its final-stage values in them. stp2_* and tmp0..tmp7 are not
  // referenced directly here; presumably they are macro scratch -- confirm.
3076da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3077da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3078da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3079da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3080da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_30, stp1_31;
3081da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3082da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3083da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3084da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3085da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp2_30, stp2_31;
3086da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3087da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int i;
3088da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3089da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Load input data. Only need to load the top left 8x8 block.
  // One load per coefficient row; consecutive rows are 32 elements apart.
  // load_input_data is defined elsewhere; presumably it yields eight 16-bit
  // lanes per call -- confirm.
30902263fc984bdc858ee931d3e35c87c404de923950Johann  in[0] = load_input_data(input);
30912263fc984bdc858ee931d3e35c87c404de923950Johann  in[1] = load_input_data(input + 32);
30922263fc984bdc858ee931d3e35c87c404de923950Johann  in[2] = load_input_data(input + 64);
30932263fc984bdc858ee931d3e35c87c404de923950Johann  in[3] = load_input_data(input + 96);
30942263fc984bdc858ee931d3e35c87c404de923950Johann  in[4] = load_input_data(input + 128);
30952263fc984bdc858ee931d3e35c87c404de923950Johann  in[5] = load_input_data(input + 160);
30962263fc984bdc858ee931d3e35c87c404de923950Johann  in[6] = load_input_data(input + 192);
30972263fc984bdc858ee931d3e35c87c404de923950Johann  in[7] = load_input_data(input + 224);
3098da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
  // The remaining 24 working vectors are cleared rather than loaded.
3099da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 8; i < 32; ++i) {
3100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[i] = _mm_setzero_si128();
3101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  array_transpose_8x8(in, in);
3104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // TODO(hkuang): The following transposes are unnecessary, but removing them
3105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // leads to a performance drop on some devices.
  // (They operate on in[8..31], which the loop above has just zeroed.)
3106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  array_transpose_8x8(in + 8, in + 8);
3107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  array_transpose_8x8(in + 16, in + 16);
3108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  array_transpose_8x8(in + 24, in + 24);
3109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
  // First pass: a single expansion of the transform macro over in[].
3110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  IDCT32_34
3111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // 1_D: Store 32 intermediate results for each 8x32 block.
  // Last butterfly: for k = 0..15, col[k] = stp1_k + stp1_(31-k) and
  // col[31-k] = stp1_k - stp1_(31-k).
3113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[0] = _mm_add_epi16(stp1_0, stp1_31);
3114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[1] = _mm_add_epi16(stp1_1, stp1_30);
3115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[2] = _mm_add_epi16(stp1_2, stp1_29);
3116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[3] = _mm_add_epi16(stp1_3, stp1_28);
3117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[4] = _mm_add_epi16(stp1_4, stp1_27);
3118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[5] = _mm_add_epi16(stp1_5, stp1_26);
3119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[6] = _mm_add_epi16(stp1_6, stp1_25);
3120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[7] = _mm_add_epi16(stp1_7, stp1_24);
3121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[8] = _mm_add_epi16(stp1_8, stp1_23);
3122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[9] = _mm_add_epi16(stp1_9, stp1_22);
3123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[10] = _mm_add_epi16(stp1_10, stp1_21);
3124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[11] = _mm_add_epi16(stp1_11, stp1_20);
3125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[12] = _mm_add_epi16(stp1_12, stp1_19);
3126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[13] = _mm_add_epi16(stp1_13, stp1_18);
3127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[14] = _mm_add_epi16(stp1_14, stp1_17);
3128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[15] = _mm_add_epi16(stp1_15, stp1_16);
3129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
  // Second pass: four iterations, each consuming eight col[] vectors and
  // writing a strip 8 pixels wide and 32 rows tall into dest.
3145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 4; i++) {
3146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    int j;
    // zero is not referenced directly in this loop; presumably the
    // RECON_AND_STORE expansion below reads it by name -- confirm.
3147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const __m128i zero = _mm_setzero_si128();
3148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Transpose 32x8 block to 8x32 block
    // NOTE(review): only in[0..7] is rewritten here. From the second
    // iteration on, in[8..31] still hold the previous iteration's rounded
    // output, so this is correct only if IDCT32_34 reads nothing beyond
    // in[0..7] -- confirm against the macro definition.
3149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(col + i * 8, in);
3150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    IDCT32_34
3151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // 2_D: Calculate the results and store them to destination.
    // Same last butterfly as in the first pass, written back into in[].
3153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[0] = _mm_add_epi16(stp1_0, stp1_31);
3154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[1] = _mm_add_epi16(stp1_1, stp1_30);
3155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[2] = _mm_add_epi16(stp1_2, stp1_29);
3156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[3] = _mm_add_epi16(stp1_3, stp1_28);
3157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[4] = _mm_add_epi16(stp1_4, stp1_27);
3158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[5] = _mm_add_epi16(stp1_5, stp1_26);
3159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[6] = _mm_add_epi16(stp1_6, stp1_25);
3160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[7] = _mm_add_epi16(stp1_7, stp1_24);
3161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[8] = _mm_add_epi16(stp1_8, stp1_23);
3162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[9] = _mm_add_epi16(stp1_9, stp1_22);
3163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[10] = _mm_add_epi16(stp1_10, stp1_21);
3164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[11] = _mm_add_epi16(stp1_11, stp1_20);
3165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[12] = _mm_add_epi16(stp1_12, stp1_19);
3166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[13] = _mm_add_epi16(stp1_13, stp1_18);
3167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[14] = _mm_add_epi16(stp1_14, stp1_17);
3168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[15] = _mm_add_epi16(stp1_15, stp1_16);
3169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
    // j indexes the destination row within the current strip.
3186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (j = 0; j < 32; ++j) {
3187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Final rounding and shift
      // Saturating add of the bias, then arithmetic shift right by 6.
3188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      in[j] = _mm_adds_epi16(in[j], final_rounding);
3189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      in[j] = _mm_srai_epi16(in[j], 6);
      // RECON_AND_STORE is defined outside this view; presumably it adds the
      // residual to the 8 pixels at this address and stores them -- confirm.
3190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      RECON_AND_STORE(dest + j * stride, in[j]);
3191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
    // Step right to the next strip of 8 columns.
3193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dest += 8;
3194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
3196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
31972263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
3198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 int stride) {
3199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();
3202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // idct constants for each stage
3204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i in[32], col[128], zero_idx[16];
3252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp1_30, stp1_31;
3257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          stp2_30, stp2_31;
3262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int i, j, i32;
3264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 4; i++) {
3266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    i32 = (i << 5);
3267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // First 1-D idct
3268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Load input data.
3269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[0], input);
3270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[8], input);
3271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[16], input);
3272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[24], input);
3273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[1], input);
3274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[9], input);
3275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[17], input);
3276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[25], input);
3277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[2], input);
3278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[10], input);
3279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[18], input);
3280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[26], input);
3281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[3], input);
3282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[11], input);
3283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[19], input);
3284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[27], input);
3285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[4], input);
3287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[12], input);
3288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[20], input);
3289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[28], input);
3290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[5], input);
3291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[13], input);
3292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[21], input);
3293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[29], input);
3294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[6], input);
3295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[14], input);
3296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[22], input);
3297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[30], input);
3298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[7], input);
3299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[15], input);
3300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[23], input);
3301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LOAD_DQCOEFF(in[31], input);
3302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // checking if all entries are zero
3304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[0] = _mm_or_si128(in[0], in[1]);
3305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[1] = _mm_or_si128(in[2], in[3]);
3306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[2] = _mm_or_si128(in[4], in[5]);
3307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[3] = _mm_or_si128(in[6], in[7]);
3308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[4] = _mm_or_si128(in[8], in[9]);
3309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[5] = _mm_or_si128(in[10], in[11]);
3310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[6] = _mm_or_si128(in[12], in[13]);
3311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[7] = _mm_or_si128(in[14], in[15]);
3312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[8] = _mm_or_si128(in[16], in[17]);
3313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[9] = _mm_or_si128(in[18], in[19]);
3314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[10] = _mm_or_si128(in[20], in[21]);
3315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[11] = _mm_or_si128(in[22], in[23]);
3316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[12] = _mm_or_si128(in[24], in[25]);
3317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[13] = _mm_or_si128(in[26], in[27]);
3318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[14] = _mm_or_si128(in[28], in[29]);
3319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[15] = _mm_or_si128(in[30], in[31]);
3320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 0] = _mm_setzero_si128();
3340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 1] = _mm_setzero_si128();
3341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 2] = _mm_setzero_si128();
3342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 3] = _mm_setzero_si128();
3343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 4] = _mm_setzero_si128();
3344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 5] = _mm_setzero_si128();
3345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 6] = _mm_setzero_si128();
3346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 7] = _mm_setzero_si128();
3347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 8] = _mm_setzero_si128();
3348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 9] = _mm_setzero_si128();
3349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 10] = _mm_setzero_si128();
3350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 11] = _mm_setzero_si128();
3351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 12] = _mm_setzero_si128();
3352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 13] = _mm_setzero_si128();
3353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 14] = _mm_setzero_si128();
3354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 15] = _mm_setzero_si128();
3355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 16] = _mm_setzero_si128();
3356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 17] = _mm_setzero_si128();
3357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 18] = _mm_setzero_si128();
3358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 19] = _mm_setzero_si128();
3359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 20] = _mm_setzero_si128();
3360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 21] = _mm_setzero_si128();
3361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 22] = _mm_setzero_si128();
3362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 23] = _mm_setzero_si128();
3363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 24] = _mm_setzero_si128();
3364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 25] = _mm_setzero_si128();
3365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 26] = _mm_setzero_si128();
3366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 27] = _mm_setzero_si128();
3367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 28] = _mm_setzero_si128();
3368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 29] = _mm_setzero_si128();
3369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 30] = _mm_setzero_si128();
3370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      col[i32 + 31] = _mm_setzero_si128();
3371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      continue;
3372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Transpose 32x8 block to 8x32 block
3375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(in, in);
3376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(in + 8, in + 8);
3377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(in + 16, in + 16);
3378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(in + 24, in + 24);
3379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    IDCT32
3381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // 1_D: Store 32 intermediate results for each 8x32 block.
3383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 4; i++) {
3417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Second 1-D idct
3418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    j = i << 3;
3419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Transpose 32x8 block to 8x32 block
3421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(col + j, in);
3422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(col + j + 32, in + 8);
3423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(col + j + 64, in + 16);
3424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    array_transpose_8x8(col + j + 96, in + 24);
3425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    IDCT32
3427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // 2_D: Calculate the results and store them to destination.
3429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[0] = _mm_add_epi16(stp1_0, stp1_31);
3430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[1] = _mm_add_epi16(stp1_1, stp1_30);
3431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[2] = _mm_add_epi16(stp1_2, stp1_29);
3432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[3] = _mm_add_epi16(stp1_3, stp1_28);
3433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[4] = _mm_add_epi16(stp1_4, stp1_27);
3434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[5] = _mm_add_epi16(stp1_5, stp1_26);
3435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[6] = _mm_add_epi16(stp1_6, stp1_25);
3436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[7] = _mm_add_epi16(stp1_7, stp1_24);
3437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[8] = _mm_add_epi16(stp1_8, stp1_23);
3438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[9] = _mm_add_epi16(stp1_9, stp1_22);
3439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[10] = _mm_add_epi16(stp1_10, stp1_21);
3440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[11] = _mm_add_epi16(stp1_11, stp1_20);
3441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[12] = _mm_add_epi16(stp1_12, stp1_19);
3442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[13] = _mm_add_epi16(stp1_13, stp1_18);
3443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[14] = _mm_add_epi16(stp1_14, stp1_17);
3444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[15] = _mm_add_epi16(stp1_15, stp1_16);
3445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (j = 0; j < 32; ++j) {
3463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Final rounding and shift
3464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      in[j] = _mm_adds_epi16(in[j], final_rounding);
3465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      in[j] = _mm_srai_epi16(in[j], 6);
3466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      RECON_AND_STORE(dest + j * stride, in[j]);
3467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dest += 8;
3470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
3472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
34732263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
34742263fc984bdc858ee931d3e35c87c404de923950Johann                              int stride) {
3475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i dc_value;
3476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_setzero_si128();
34772263fc984bdc858ee931d3e35c87c404de923950Johann  int a, j;
3478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = dct_const_round_shift(input[0] * cospi_16_64);
3480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = dct_const_round_shift(a * cospi_16_64);
3481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  a = ROUND_POWER_OF_TWO(a, 6);
3482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dc_value = _mm_set1_epi16(a);
3484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
34852263fc984bdc858ee931d3e35c87c404de923950Johann  for (j = 0; j < 32; ++j) {
34862263fc984bdc858ee931d3e35c87c404de923950Johann    RECON_AND_STORE(dest +  0 + j * stride, dc_value);
34872263fc984bdc858ee931d3e35c87c404de923950Johann    RECON_AND_STORE(dest +  8 + j * stride, dc_value);
34882263fc984bdc858ee931d3e35c87c404de923950Johann    RECON_AND_STORE(dest + 16 + j * stride, dc_value);
34892263fc984bdc858ee931d3e35c87c404de923950Johann    RECON_AND_STORE(dest + 24 + j * stride, dc_value);
3490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
3492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#if CONFIG_VP9_HIGHBITDEPTH
// Clamps every signed 16-bit lane of value to [0, (1 << bd) - 1].
// The upper bound is built with 16-bit lane arithmetic, so the stated range
// holds for bit depths whose maximum fits in a signed 16-bit lane.
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
  const __m128i unit = _mm_set1_epi16(1);
  const __m128i lower = _mm_setzero_si128();
  const __m128i upper = _mm_subs_epi16(_mm_slli_epi16(unit, bd), unit);

  // Cap at the upper bound first, then raise negative lanes to zero.
  return _mm_max_epi16(_mm_min_epi16(value, upper), lower);
}
3506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
3508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                    int stride, int bd) {
3509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tran_low_t out[4 * 4];
3510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tran_low_t *outptr = out;
3511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int i, j;
3512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i inptr[4];
3513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i sign_bits[2];
3514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i temp_mm, min_input, max_input;
3515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int test;
3516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int optimised_cols = 0;
3518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_set1_epi16(0);
3519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i eight = _mm_set1_epi16(8);
3520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i max = _mm_set1_epi16(12043);
3521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i min = _mm_set1_epi16(-12043);
3522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Load input into __m128i
3523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  inptr[0] = _mm_loadu_si128((const __m128i *)input);
3524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
3525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
3526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
3527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Pack to 16 bits
3529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
3530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
3531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  max_input = _mm_max_epi16(inptr[0], inptr[1]);
3533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  min_input = _mm_min_epi16(inptr[0], inptr[1]);
3534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  max_input = _mm_cmpgt_epi16(max_input, max);
3535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  min_input = _mm_cmplt_epi16(min_input, min);
3536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  temp_mm = _mm_or_si128(max_input, min_input);
3537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  test = _mm_movemask_epi8(temp_mm);
3538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (!test) {
3540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Do the row transform
3541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    idct4_sse2(inptr);
3542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Check the min & max values
3544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_max_epi16(inptr[0], inptr[1]);
3545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_min_epi16(inptr[0], inptr[1]);
3546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_cmpgt_epi16(max_input, max);
3547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_cmplt_epi16(min_input, min);
3548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp_mm = _mm_or_si128(max_input, min_input);
3549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    test = _mm_movemask_epi8(temp_mm);
3550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (test) {
3552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      transpose_4x4(inptr);
3553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
3554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
3555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
3556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
3557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
3558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
3559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
3560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
3561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
3562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
3563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    } else {
3564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Set to use the optimised transform for the column
3565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      optimised_cols = 1;
3566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
3568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Run the un-optimised row transform
3569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 0; i < 4; ++i) {
3570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      vpx_highbd_idct4_c(input, outptr, bd);
3571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      input += 4;
3572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      outptr += 4;
3573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (optimised_cols) {
3577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    idct4_sse2(inptr);
3578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Final round and shift
3580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    inptr[0] = _mm_add_epi16(inptr[0], eight);
3581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    inptr[1] = _mm_add_epi16(inptr[1], eight);
3582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    inptr[0] = _mm_srai_epi16(inptr[0], 4);
3584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    inptr[1] = _mm_srai_epi16(inptr[1], 4);
3585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Reconstruction and Store
3587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
3588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
3589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
3590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      d0 = _mm_unpacklo_epi64(
3591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
3592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      d2 = _mm_unpacklo_epi64(
3593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
3594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
3595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
3596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // store input0
3597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      _mm_storel_epi64((__m128i *)dest, d0);
3598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // store input1
3599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      d0 = _mm_srli_si128(d0, 8);
3600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      _mm_storel_epi64((__m128i *)(dest + stride), d0);
3601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // store input2
3602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
3603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // store input3
3604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      d2 = _mm_srli_si128(d2, 8);
3605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
3606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
3608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Run the un-optimised column transform
3609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tran_low_t temp_in[4], temp_out[4];
3610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Columns
3611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 0; i < 4; ++i) {
3612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (j = 0; j < 4; ++j)
3613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp_in[j] = out[j * 4 + i];
3614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      vpx_highbd_idct4_c(temp_in, temp_out, bd);
3615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (j = 0; j < 4; ++j) {
3616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dest[j * stride + i] = highbd_clip_pixel_add(
3617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
3618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
3619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
3622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
3624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                    int stride, int bd) {
3625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tran_low_t out[8 * 8];
3626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tran_low_t *outptr = out;
3627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int i, j, test;
3628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i inptr[8];
3629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i min_input, max_input, temp1, temp2, sign_bits;
3630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_set1_epi16(0);
3632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i sixteen = _mm_set1_epi16(16);
3633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i max = _mm_set1_epi16(6201);
3634da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i min = _mm_set1_epi16(-6201);
3635da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int optimised_cols = 0;
3636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Load input into __m128i & pack to 16 bits
3638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 8; i++) {
3639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    inptr[i] = _mm_packs_epi32(temp1, temp2);
3642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Find the min & max for the row transform
3645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  max_input = _mm_max_epi16(inptr[0], inptr[1]);
3646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  min_input = _mm_min_epi16(inptr[0], inptr[1]);
3647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 2; i < 8; i++) {
3648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_max_epi16(max_input, inptr[i]);
3649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_min_epi16(min_input, inptr[i]);
3650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  max_input = _mm_cmpgt_epi16(max_input, max);
3652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  min_input = _mm_cmplt_epi16(min_input, min);
3653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  temp1 = _mm_or_si128(max_input, min_input);
3654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  test = _mm_movemask_epi8(temp1);
3655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (!test) {
3657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Do the row transform
3658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    idct8_sse2(inptr);
3659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Find the min & max for the column transform
3661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_max_epi16(inptr[0], inptr[1]);
3662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_min_epi16(inptr[0], inptr[1]);
3663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 2; i < 8; i++) {
3664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      max_input = _mm_max_epi16(max_input, inptr[i]);
3665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      min_input = _mm_min_epi16(min_input, inptr[i]);
3666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_cmpgt_epi16(max_input, max);
3668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_cmplt_epi16(min_input, min);
3669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp1 = _mm_or_si128(max_input, min_input);
3670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    test = _mm_movemask_epi8(temp1);
3671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (test) {
3673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      array_transpose_8x8(inptr, inptr);
3674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (i = 0; i < 8; i++) {
3675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
3681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    } else {
3682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Set to use the optimised transform for the column
3683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      optimised_cols = 1;
3684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
3686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Run the un-optimised row transform
3687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 0; i < 8; ++i) {
3688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      vpx_highbd_idct8_c(input, outptr, bd);
3689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      input += 8;
3690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      outptr += 8;
3691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3693da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (optimised_cols) {
3695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    idct8_sse2(inptr);
3696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Final round & shift and Reconstruction and Store
3698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
3699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      __m128i d[8];
3700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (i = 0; i < 8; i++) {
3701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i] = _mm_srai_epi16(inptr[i], 5);
3704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Store
3706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
3707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
3708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
3710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Run the un-optimised column transform
3711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tran_low_t temp_in[8], temp_out[8];
3712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 0; i < 8; ++i) {
3713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (j = 0; j < 8; ++j)
3714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp_in[j] = out[j * 8 + i];
3715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      vpx_highbd_idct8_c(temp_in, temp_out, bd);
3716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (j = 0; j < 8; ++j) {
3717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dest[j * stride + i] = highbd_clip_pixel_add(
3718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
3720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3722da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
3723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                    int stride, int bd) {
3726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tran_low_t out[8 * 8] = { 0 };
3727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tran_low_t *outptr = out;
3728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int i, j, test;
3729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i inptr[8];
3730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i min_input, max_input, temp1, temp2, sign_bits;
3731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_set1_epi16(0);
3733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i sixteen = _mm_set1_epi16(16);
3734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i max = _mm_set1_epi16(6201);
3735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i min = _mm_set1_epi16(-6201);
3736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int optimised_cols = 0;
3737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Load input into __m128i & pack to 16 bits
3739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 8; i++) {
3740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3742da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    inptr[i] = _mm_packs_epi32(temp1, temp2);
3743da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3744da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Find the min & max for the row transform
3746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // only first 4 row has non-zero coefs
3747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  max_input = _mm_max_epi16(inptr[0], inptr[1]);
3748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  min_input = _mm_min_epi16(inptr[0], inptr[1]);
3749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 2; i < 4; i++) {
3750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_max_epi16(max_input, inptr[i]);
3751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_min_epi16(min_input, inptr[i]);
3752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  max_input = _mm_cmpgt_epi16(max_input, max);
3754da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  min_input = _mm_cmplt_epi16(min_input, min);
3755da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  temp1 = _mm_or_si128(max_input, min_input);
3756da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  test = _mm_movemask_epi8(temp1);
3757da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (!test) {
3759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Do the row transform
3760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    idct8_sse2(inptr);
3761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Find the min & max for the column transform
3763da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // N.B. Only first 4 cols contain non-zero coeffs
3764da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_max_epi16(inptr[0], inptr[1]);
3765da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_min_epi16(inptr[0], inptr[1]);
3766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 2; i < 8; i++) {
3767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      max_input = _mm_max_epi16(max_input, inptr[i]);
3768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      min_input = _mm_min_epi16(min_input, inptr[i]);
3769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_cmpgt_epi16(max_input, max);
3771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_cmplt_epi16(min_input, min);
3772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp1 = _mm_or_si128(max_input, min_input);
3773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    test = _mm_movemask_epi8(temp1);
3774da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3775da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (test) {
3776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Use fact only first 4 rows contain non-zero coeffs
3777da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      array_transpose_4X8(inptr, inptr);
3778da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (i = 0; i < 4; i++) {
3779da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3780da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3782da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3783da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
3785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    } else {
3786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Set to use the optimised transform for the column
3787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      optimised_cols = 1;
3788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
3790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Run the un-optimised row transform
3791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 0; i < 4; ++i) {
3792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      vpx_highbd_idct8_c(input, outptr, bd);
3793da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      input += 8;
3794da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      outptr += 8;
3795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3796da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3797da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3798da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (optimised_cols) {
3799da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    idct8_sse2(inptr);
3800da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3801da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Final round & shift and Reconstruction and Store
3802da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
3803da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      __m128i d[8];
3804da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (i = 0; i < 8; i++) {
3805da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3806da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3807da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i] = _mm_srai_epi16(inptr[i], 5);
3808da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3809da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Store
3810da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
3811da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
3812da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3813da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
3814da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Run the un-optimised column transform
3815da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tran_low_t temp_in[8], temp_out[8];
3816da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 0; i < 8; ++i) {
3817da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (j = 0; j < 8; ++j)
3818da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp_in[j] = out[j * 8 + i];
3819da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      vpx_highbd_idct8_c(temp_in, temp_out, bd);
3820da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (j = 0; j < 8; ++j) {
3821da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dest[j * stride + i] = highbd_clip_pixel_add(
3822da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3823da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
3824da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3825da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3826da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
3827da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3828da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
3829da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                       int stride, int bd) {
3830da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tran_low_t out[16 * 16];
3831da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tran_low_t *outptr = out;
3832da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int i, j, test;
3833da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i inptr[32];
3834da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i min_input, max_input, temp1, temp2, sign_bits;
3835da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3836da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_set1_epi16(0);
3837da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i rounding = _mm_set1_epi16(32);
3838da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i max = _mm_set1_epi16(3155);
3839da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i min = _mm_set1_epi16(-3155);
3840da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int optimised_cols = 0;
3841da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3842da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Load input into __m128i & pack to 16 bits
3843da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 16; i++) {
3844da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3845da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3846da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    inptr[i] = _mm_packs_epi32(temp1, temp2);
3847da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3848da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3849da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3850da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3851da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3852da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Find the min & max for the row transform
3853da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  max_input = _mm_max_epi16(inptr[0], inptr[1]);
3854da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  min_input = _mm_min_epi16(inptr[0], inptr[1]);
3855da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 2; i < 32; i++) {
3856da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_max_epi16(max_input, inptr[i]);
3857da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_min_epi16(min_input, inptr[i]);
3858da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3859da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  max_input = _mm_cmpgt_epi16(max_input, max);
3860da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  min_input = _mm_cmplt_epi16(min_input, min);
3861da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  temp1 = _mm_or_si128(max_input, min_input);
3862da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  test = _mm_movemask_epi8(temp1);
3863da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3864da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (!test) {
3865da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Do the row transform
3866da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    idct16_sse2(inptr, inptr + 16);
3867da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3868da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Find the min & max for the column transform
3869da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_max_epi16(inptr[0], inptr[1]);
3870da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_min_epi16(inptr[0], inptr[1]);
3871da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 2; i < 32; i++) {
3872da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      max_input = _mm_max_epi16(max_input, inptr[i]);
3873da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      min_input = _mm_min_epi16(min_input, inptr[i]);
3874da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3875da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_cmpgt_epi16(max_input, max);
3876da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_cmplt_epi16(min_input, min);
3877da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp1 = _mm_or_si128(max_input, min_input);
3878da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    test = _mm_movemask_epi8(temp1);
3879da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3880da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (test) {
3881da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      array_transpose_16x16(inptr, inptr + 16);
3882da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (i = 0; i < 16; i++) {
3883da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3884da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3885da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3886da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
3887da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
3888da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
3889da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
3890da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
3891da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
3892da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
3893da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
3894da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    } else {
3895da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Set to use the optimised transform for the column
3896da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      optimised_cols = 1;
3897da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3898da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
3899da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Run the un-optimised row transform
3900da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 0; i < 16; ++i) {
3901da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      vpx_highbd_idct16_c(input, outptr, bd);
3902da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      input += 16;
3903da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      outptr += 16;
3904da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3905da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3906da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3907da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (optimised_cols) {
3908da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    idct16_sse2(inptr, inptr + 16);
3909da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3910da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Final round & shift and Reconstruction and Store
3911da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
3912da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      __m128i d[2];
3913da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (i = 0; i < 16; i++) {
3914da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
3915da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
3916da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3917da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
3918da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
3919da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
3920da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
3921da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
3922da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Store
3923da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
3924da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
3925da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
3926da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3927da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
3928da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Run the un-optimised column transform
3929da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tran_low_t temp_in[16], temp_out[16];
3930da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 0; i < 16; ++i) {
3931da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (j = 0; j < 16; ++j)
3932da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp_in[j] = out[j * 16 + i];
3933da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      vpx_highbd_idct16_c(temp_in, temp_out, bd);
3934da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (j = 0; j < 16; ++j) {
3935da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dest[j * stride + i] = highbd_clip_pixel_add(
3936da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
3937da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
3938da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3939da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3940da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
3941da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3942da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3943da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      int stride, int bd) {
3944da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tran_low_t out[16 * 16] = { 0 };
3945da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tran_low_t *outptr = out;
3946da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int i, j, test;
3947da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i inptr[32];
3948da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i min_input, max_input, temp1, temp2, sign_bits;
3949da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3950da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i zero = _mm_set1_epi16(0);
3951da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i rounding = _mm_set1_epi16(32);
3952da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i max = _mm_set1_epi16(3155);
3953da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const __m128i min = _mm_set1_epi16(-3155);
3954da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int optimised_cols = 0;
3955da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3956da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Load input into __m128i & pack to 16 bits
3957da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 0; i < 16; i++) {
3958da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3959da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3960da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    inptr[i] = _mm_packs_epi32(temp1, temp2);
3961da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3962da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3963da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3964da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3965da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3966da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Find the min & max for the row transform
3967da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // Since all non-zero dct coefficients are in upper-left 4x4 area,
3968da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  // we only need to consider first 4 rows here.
3969da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  max_input = _mm_max_epi16(inptr[0], inptr[1]);
3970da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  min_input = _mm_min_epi16(inptr[0], inptr[1]);
3971da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (i = 2; i < 4; i++) {
3972da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_max_epi16(max_input, inptr[i]);
3973da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_min_epi16(min_input, inptr[i]);
3974da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
3975da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  max_input = _mm_cmpgt_epi16(max_input, max);
3976da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  min_input = _mm_cmplt_epi16(min_input, min);
3977da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  temp1 = _mm_or_si128(max_input, min_input);
3978da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  test = _mm_movemask_epi8(temp1);
3979da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3980da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (!test) {
3981da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Do the row transform (N.B. This transposes inptr)
3982da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    idct16_sse2(inptr, inptr + 16);
3983da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3984da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Find the min & max for the column transform
3985da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // N.B. Only first 4 cols contain non-zero coeffs
3986da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_max_epi16(inptr[0], inptr[1]);
3987da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_min_epi16(inptr[0], inptr[1]);
3988da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 2; i < 16; i++) {
3989da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      max_input = _mm_max_epi16(max_input, inptr[i]);
3990da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      min_input = _mm_min_epi16(min_input, inptr[i]);
3991da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
3992da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    max_input = _mm_cmpgt_epi16(max_input, max);
3993da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    min_input = _mm_cmplt_epi16(min_input, min);
3994da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    temp1 = _mm_or_si128(max_input, min_input);
3995da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    test = _mm_movemask_epi8(temp1);
3996da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3997da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (test) {
3998da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Use fact only first 4 rows contain non-zero coeffs
3999da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      array_transpose_8x8(inptr, inptr);
4000da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      array_transpose_8x8(inptr + 8, inptr + 16);
4001da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (i = 0; i < 4; i++) {
4002da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
4003da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
4004da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
4005da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
4006da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
4007da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
4008da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
4009da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
4010da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
4011da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
4012da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
4013da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    } else {
4014da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Set to use the optimised transform for the column
4015da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      optimised_cols = 1;
4016da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
4017da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
4018da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Run the un-optimised row transform
4019da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 0; i < 4; ++i) {
4020da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      vpx_highbd_idct16_c(input, outptr, bd);
4021da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      input += 16;
4022da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      outptr += 16;
4023da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
4024da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
4025da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
4026da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (optimised_cols) {
4027da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    idct16_sse2(inptr, inptr + 16);
4028da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
4029da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Final round & shift and Reconstruction and Store
4030da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
4031da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      __m128i d[2];
4032da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (i = 0; i < 16; i++) {
4033da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
4034da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
4035da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
4036da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
4037da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
4038da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
4039da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
4040da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
4041da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Store
4042da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
4043da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
4044da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
4045da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
4046da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
4047da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Run the un-optimised column transform
4048da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tran_low_t temp_in[16], temp_out[16];
4049da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (i = 0; i < 16; ++i) {
4050da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (j = 0; j < 16; ++j)
4051da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        temp_in[j] = out[j * 16 + i];
4052da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      vpx_highbd_idct16_c(temp_in, temp_out, bd);
4053da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      for (j = 0; j < 16; ++j) {
4054da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dest[j * stride + i] = highbd_clip_pixel_add(
4055da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
4056da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
4057da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
4058da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
4059da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
4060da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#endif  // CONFIG_VP9_HIGHBITDEPTH
4061