// vp9_dct_sse2.c revision 91037db265ecdd914a26e056cf69207b4f50924e
1ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang/* 2ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * 4ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * Use of this source code is governed by a BSD-style license 5ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * that can be found in the LICENSE file in the root of the source 6ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * tree. An additional intellectual property rights grant can be found 7ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * in the file PATENTS. All contributing project authors may 8ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * be found in the AUTHORS file in the root of the source tree. 9ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang */ 10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <emmintrin.h> // SSE2 12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_idct.h" // for cospi constants 1391037db265ecdd914a26e056cf69207b4f50924ehkuang#include "vpx_ports/mem.h" 14ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 15ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { 16ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // The 2D transform is done with two passes which are actually pretty 17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // similar. In the first one, we transform the columns and transpose 18ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // the results. In the second one, we transform the rows. To achieve that, 19ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // as the first pass results are transposed, we tranpose the columns (that 20ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // is the transposed rows) and transpose the results (so that it goes back 21ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // in normal/row positions). 
22ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const int stride = pitch >> 1; 23ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int pass; 24ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Constants 25ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // When we use them, in one case, they are all the same. In all others 26ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // it's a pair of them that we need to repeat four times. This is done 27ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // by constructing the 32 bit constant corresponding to that pair. 28ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 29ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 30ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 31ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 32ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 33ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 34ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i kOne = _mm_set1_epi16(1); 36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i in0, in1, in2, in3; 37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Load inputs. 
38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 39ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 40ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); 42ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 43ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // x = x << 4 44ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_slli_epi16(in0, 4); 45ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_slli_epi16(in1, 4); 46ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_slli_epi16(in2, 4); 47ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_slli_epi16(in3, 4); 48ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // if (i == 0 && input[0]) input[0] += 1; 49ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 50ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // The mask will only contain wether the first value is zero, all 51ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // other comparison will fail as something shifted by 4 (above << 4) 52ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // can never be equal to one. 
To increment in the non-zero case, we 53ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // add the mask and one for the first element: 54ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // - if zero, mask = -1, v = v - 1 + 1 = v 55ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 56ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); 57ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_add_epi16(in0, mask); 58ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_add_epi16(in0, k__nonzero_bias_b); 59ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 60ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 61ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Do the two transform/transpose passes 62ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang for (pass = 0; pass < 2; ++pass) { 63ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Transform 1/2: Add/substract 64ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r0 = _mm_add_epi16(in0, in3); 65ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r1 = _mm_add_epi16(in1, in2); 66ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r2 = _mm_sub_epi16(in1, in2); 67ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r3 = _mm_sub_epi16(in0, in3); 68ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Transform 1/2: Interleave to do the multiply by constants which gets us 69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // into 32 bits. 
70ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 71ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 73ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 74ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 75ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 76ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 77ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 78ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 79ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 80ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 81ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 82ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 83ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 84ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine and transpose 85ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i res0 = _mm_packs_epi32(w0, w2); 86ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i res1 = _mm_packs_epi32(w4, w6); 87ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 01 02 03 20 21 22 23 88ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 10 11 12 13 30 31 32 33 89ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); 
90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); 91ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 10 01 11 02 12 03 13 92ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 20 30 21 31 22 32 23 33 93ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 94ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 95ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 96ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 97ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (0 == pass) { 98ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Extract values in the high part for second pass as transform code 99ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // only uses the first four values. 100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_unpackhi_epi64(in0, in0); 101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_unpackhi_epi64(in2, in2); 102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } else { 103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Post-condition output and store it (v + 1) >> 2, taking advantage 104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // of the fact 1/3 are stored just after 0/2. 
105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i out01 = _mm_add_epi16(in0, kOne); 106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i out23 = _mm_add_epi16(in2, kOne); 107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out01 = _mm_srai_epi16(out01, 2); 108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out23 = _mm_srai_epi16(out23, 2); 109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); 110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); 111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) { 116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang vp9_short_fdct4x4_sse2(input, output, pitch); 117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch); 118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 12091037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) { 12191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 12291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 12391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i mask; 12491037db265ecdd914a26e056cf69207b4f50924ehkuang 12591037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 12691037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 12791037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_loadl_epi64((const __m128i 
*)(input + 2 * stride)); 12891037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 12991037db265ecdd914a26e056cf69207b4f50924ehkuang 13091037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_slli_epi16(in[0], 4); 13191037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_slli_epi16(in[1], 4); 13291037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_slli_epi16(in[2], 4); 13391037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_slli_epi16(in[3], 4); 13491037db265ecdd914a26e056cf69207b4f50924ehkuang 13591037db265ecdd914a26e056cf69207b4f50924ehkuang mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); 13691037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_add_epi16(in[0], mask); 13791037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); 13891037db265ecdd914a26e056cf69207b4f50924ehkuang} 13991037db265ecdd914a26e056cf69207b4f50924ehkuang 14091037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void write_buffer_4x4(int16_t *output, __m128i *res) { 14191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i kOne = _mm_set1_epi16(1); 14291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); 14391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); 14491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i out01 = _mm_add_epi16(in01, kOne); 14591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i out23 = _mm_add_epi16(in23, kOne); 14691037db265ecdd914a26e056cf69207b4f50924ehkuang out01 = _mm_srai_epi16(out01, 2); 14791037db265ecdd914a26e056cf69207b4f50924ehkuang out23 = _mm_srai_epi16(out23, 2); 14891037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 0 * 8), out01); 14991037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 1 * 8), out23); 15091037db265ecdd914a26e056cf69207b4f50924ehkuang} 
15191037db265ecdd914a26e056cf69207b4f50924ehkuang 15291037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void transpose_4x4(__m128i *res) { 15391037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine and transpose 15491037db265ecdd914a26e056cf69207b4f50924ehkuang // 00 01 02 03 20 21 22 23 15591037db265ecdd914a26e056cf69207b4f50924ehkuang // 10 11 12 13 30 31 32 33 15691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 15791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); 15891037db265ecdd914a26e056cf69207b4f50924ehkuang 15991037db265ecdd914a26e056cf69207b4f50924ehkuang // 00 10 01 11 02 12 03 13 16091037db265ecdd914a26e056cf69207b4f50924ehkuang // 20 30 21 31 22 32 23 33 16191037db265ecdd914a26e056cf69207b4f50924ehkuang res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); 16291037db265ecdd914a26e056cf69207b4f50924ehkuang res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); 16391037db265ecdd914a26e056cf69207b4f50924ehkuang 16491037db265ecdd914a26e056cf69207b4f50924ehkuang // 00 10 20 30 01 11 21 31 16591037db265ecdd914a26e056cf69207b4f50924ehkuang // 02 12 22 32 03 13 23 33 16691037db265ecdd914a26e056cf69207b4f50924ehkuang // only use the first 4 16-bit integers 16791037db265ecdd914a26e056cf69207b4f50924ehkuang res[1] = _mm_unpackhi_epi64(res[0], res[0]); 16891037db265ecdd914a26e056cf69207b4f50924ehkuang res[3] = _mm_unpackhi_epi64(res[2], res[2]); 16991037db265ecdd914a26e056cf69207b4f50924ehkuang} 17091037db265ecdd914a26e056cf69207b4f50924ehkuang 17191037db265ecdd914a26e056cf69207b4f50924ehkuangvoid fdct4_1d_sse2(__m128i *in) { 17291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 17391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 17491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, 
cospi_8_64); 17591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 17691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 17791037db265ecdd914a26e056cf69207b4f50924ehkuang 17891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i u[4], v[4]; 17991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi16(in[0], in[3]); 18091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi16(in[1], in[2]); 18191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_sub_epi16(in[1], in[2]); 18291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_sub_epi16(in[0], in[3]); 18391037db265ecdd914a26e056cf69207b4f50924ehkuang 18491037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_unpacklo_epi16(u[0], u[1]); 18591037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_unpacklo_epi16(u[2], u[3]); 18691037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 18791037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 18891037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1 18991037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3 19091037db265ecdd914a26e056cf69207b4f50924ehkuang 19191037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 19291037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 19391037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 19491037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 19591037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 19691037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(v[1], 
DCT_CONST_BITS); 19791037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 19891037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 19991037db265ecdd914a26e056cf69207b4f50924ehkuang 20091037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_packs_epi32(u[0], u[1]); 20191037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_packs_epi32(u[2], u[3]); 20291037db265ecdd914a26e056cf69207b4f50924ehkuang transpose_4x4(in); 20391037db265ecdd914a26e056cf69207b4f50924ehkuang} 20491037db265ecdd914a26e056cf69207b4f50924ehkuang 20591037db265ecdd914a26e056cf69207b4f50924ehkuangvoid fadst4_1d_sse2(__m128i *in) { 20691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); 20791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); 20891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); 20991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); 21091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); 21191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i kZero = _mm_set1_epi16(0); 21291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 21391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i u[8], v[8]; 21491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in7 = _mm_add_epi16(in[0], in[1]); 21591037db265ecdd914a26e056cf69207b4f50924ehkuang in7 = _mm_sub_epi16(in7, in[3]); 21691037db265ecdd914a26e056cf69207b4f50924ehkuang 21791037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(in[0], in[1]); 21891037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpacklo_epi16(in[2], in[3]); 
21991037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(in7, kZero); 22091037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpacklo_epi16(in[2], kZero); 22191037db265ecdd914a26e056cf69207b4f50924ehkuang 22291037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 22391037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 22491037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 22591037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 22691037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 22791037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 22891037db265ecdd914a26e056cf69207b4f50924ehkuang 22991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[1]); 23091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = v[2]; 23191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[3], v[4]); 23291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_sub_epi32(u[2], u[0]); 23391037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_slli_epi32(v[5], 2); 23491037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_sub_epi32(u[4], v[5]); 23591037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(u[3], u[5]); 23691037db265ecdd914a26e056cf69207b4f50924ehkuang 23791037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 23891037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 23991037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 24091037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 24191037db265ecdd914a26e056cf69207b4f50924ehkuang 
24291037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 24391037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 24491037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 24591037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 24691037db265ecdd914a26e056cf69207b4f50924ehkuang 24791037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_packs_epi32(u[0], u[2]); 24891037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_packs_epi32(u[1], u[3]); 24991037db265ecdd914a26e056cf69207b4f50924ehkuang transpose_4x4(in); 25091037db265ecdd914a26e056cf69207b4f50924ehkuang} 25191037db265ecdd914a26e056cf69207b4f50924ehkuang 25291037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_short_fht4x4_sse2(int16_t *input, int16_t *output, 25391037db265ecdd914a26e056cf69207b4f50924ehkuang int stride, int tx_type) { 25491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in[4]; 25591037db265ecdd914a26e056cf69207b4f50924ehkuang load_buffer_4x4(input, in, stride); 25691037db265ecdd914a26e056cf69207b4f50924ehkuang switch (tx_type) { 25791037db265ecdd914a26e056cf69207b4f50924ehkuang case 0: // DCT_DCT 25891037db265ecdd914a26e056cf69207b4f50924ehkuang fdct4_1d_sse2(in); 25991037db265ecdd914a26e056cf69207b4f50924ehkuang fdct4_1d_sse2(in); 26091037db265ecdd914a26e056cf69207b4f50924ehkuang break; 26191037db265ecdd914a26e056cf69207b4f50924ehkuang case 1: // ADST_DCT 26291037db265ecdd914a26e056cf69207b4f50924ehkuang fadst4_1d_sse2(in); 26391037db265ecdd914a26e056cf69207b4f50924ehkuang fdct4_1d_sse2(in); 26491037db265ecdd914a26e056cf69207b4f50924ehkuang break; 26591037db265ecdd914a26e056cf69207b4f50924ehkuang case 2: // DCT_ADST 26691037db265ecdd914a26e056cf69207b4f50924ehkuang fdct4_1d_sse2(in); 26791037db265ecdd914a26e056cf69207b4f50924ehkuang fadst4_1d_sse2(in); 26891037db265ecdd914a26e056cf69207b4f50924ehkuang break; 
26991037db265ecdd914a26e056cf69207b4f50924ehkuang case 3: // ADST_ADST 27091037db265ecdd914a26e056cf69207b4f50924ehkuang fadst4_1d_sse2(in); 27191037db265ecdd914a26e056cf69207b4f50924ehkuang fadst4_1d_sse2(in); 27291037db265ecdd914a26e056cf69207b4f50924ehkuang break; 27391037db265ecdd914a26e056cf69207b4f50924ehkuang default: 27491037db265ecdd914a26e056cf69207b4f50924ehkuang assert(0); 27591037db265ecdd914a26e056cf69207b4f50924ehkuang break; 27691037db265ecdd914a26e056cf69207b4f50924ehkuang } 27791037db265ecdd914a26e056cf69207b4f50924ehkuang write_buffer_4x4(output, in); 27891037db265ecdd914a26e056cf69207b4f50924ehkuang} 27991037db265ecdd914a26e056cf69207b4f50924ehkuang 280ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { 281ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const int stride = pitch >> 1; 282ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int pass; 283ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Constants 284ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // When we use them, in one case, they are all the same. In all others 285ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // it's a pair of them that we need to repeat four times. This is done 286ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // by constructing the 32 bit constant corresponding to that pair. 
287ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 288ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 289ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 290ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 291ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 292ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 293ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 294ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 295ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 296ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Load input 29791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); 29891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); 29991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); 30091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); 30191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); 30291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); 30391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * 
stride)); 30491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); 305ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Pre-condition input (shift by two) 306ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_slli_epi16(in0, 2); 307ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_slli_epi16(in1, 2); 308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_slli_epi16(in2, 2); 309ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_slli_epi16(in3, 2); 310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_slli_epi16(in4, 2); 311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_slli_epi16(in5, 2); 312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_slli_epi16(in6, 2); 313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_slli_epi16(in7, 2); 314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // We do two passes, first the columns, then the rows. The results of the 316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // first pass are transposed so that the same column code can be reused. The 317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // results of the second pass are also transposed so that the rows (processed 318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // as columns) are put back in row positions. 319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang for (pass = 0; pass < 2; pass++) { 320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // To store results of each pass before the transpose. 
321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i res0, res1, res2, res3, res4, res5, res6, res7; 322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Add/substract 323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q0 = _mm_add_epi16(in0, in7); 324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q1 = _mm_add_epi16(in1, in6); 325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q2 = _mm_add_epi16(in2, in5); 326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q3 = _mm_add_epi16(in3, in4); 327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q4 = _mm_sub_epi16(in3, in4); 328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q5 = _mm_sub_epi16(in2, in5); 329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q6 = _mm_sub_epi16(in1, in6); 330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q7 = _mm_sub_epi16(in0, in7); 331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Work on first four results 332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Add/substract 334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r0 = _mm_add_epi16(q0, q3); 335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r1 = _mm_add_epi16(q1, q2); 336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r2 = _mm_sub_epi16(q1, q2); 337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r3 = _mm_sub_epi16(q0, q3); 338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Interleave to do the multiply by constants which gets us into 32bits 339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t1 = _mm_unpackhi_epi16(r0, r1); 341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(r2, 
r3); 343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); 347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); 349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); 351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = 
_mm_srai_epi32(v1, DCT_CONST_BITS); 362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res0 = _mm_packs_epi32(w0, w1); 370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res4 = _mm_packs_epi32(w2, w3); 371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res2 = _mm_packs_epi32(w4, w5); 372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res6 = _mm_packs_epi32(w6, w7); 373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Work on next four results 375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Interleave to do the multiply by constants which gets us into 32bits 377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i d0 = _mm_unpacklo_epi16(q6, q5); 378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i d1 = _mm_unpackhi_epi16(q6, q5); 379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); 380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); 381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); 382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); 
383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); 385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); 386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); 387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); 388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); 389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); 390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); 391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); 392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r0 = _mm_packs_epi32(s0, s1); 394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r1 = _mm_packs_epi32(s2, s3); 395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Add/substract 396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i x0 = _mm_add_epi16(q4, r0); 397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i x1 = _mm_sub_epi16(q4, r0); 398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i x2 = _mm_sub_epi16(q7, r1); 399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i x3 = _mm_add_epi16(q7, r1); 400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Interleave to do the multiply by constants which gets us into 32bits 401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(x0, x3); 402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t1 = _mm_unpackhi_epi16(x0, x3); 403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const 
__m128i t2 = _mm_unpacklo_epi16(x1, x2); 404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(x1, x2); 405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); 406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); 407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); 408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); 409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); 410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); 411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); 412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); 413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 
422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res1 = _mm_packs_epi32(w0, w1); 432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res7 = _mm_packs_epi32(w2, w3); 433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res5 = _mm_packs_epi32(w4, w5); 434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res3 = _mm_packs_epi32(w6, w7); 435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Transpose the 8x8. 
437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 01 02 03 04 05 06 07 439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 10 11 12 13 14 15 16 17 440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 20 21 22 23 24 25 26 27 441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 30 31 32 33 34 35 36 37 442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 40 41 42 43 44 45 46 47 443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 50 51 52 53 54 55 56 57 444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 60 61 62 63 64 65 66 67 445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 70 71 72 73 74 75 76 77 446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); 447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); 448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); 449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); 450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); 451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); 452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); 453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); 454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 10 01 11 02 12 03 13 455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 20 30 21 31 22 32 23 33 456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 04 14 05 15 06 16 07 17 457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 24 34 25 35 26 36 27 37 458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 40 50 41 51 42 52 43 53 459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 60 70 61 71 62 72 63 73 
460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 54 54 55 55 56 56 57 57 461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 64 74 65 75 66 76 67 77 462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 10 20 30 01 11 21 31 471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 40 50 60 70 41 51 61 71 472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 02 12 22 32 03 13 23 33 473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 42 52 62 72 43 53 63 73 474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 04 14 24 34 05 15 21 36 475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 44 54 64 74 45 55 61 76 476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 06 16 26 36 07 17 27 37 477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 46 56 66 76 47 57 67 77 478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 
482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 10 20 30 40 50 60 70 487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 01 11 21 31 41 51 61 71 488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 02 12 22 32 42 52 62 72 489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 03 13 23 33 43 53 63 73 490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 04 14 24 34 44 54 64 74 491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 05 15 25 35 45 55 65 75 492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 06 16 26 36 46 56 66 76 493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 07 17 27 37 47 57 67 77 494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Post-condition output and store it 497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Post-condition (division by two) 499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // division of two 16 bits signed numbers using shifts 500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // n / 2 = (n - (n >> 15)) >> 1 501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i sign_in0 = _mm_srai_epi16(in0, 15); 502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i sign_in1 = _mm_srai_epi16(in1, 15); 503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i sign_in2 = _mm_srai_epi16(in2, 15); 504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i sign_in3 = _mm_srai_epi16(in3, 15); 505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i sign_in4 = _mm_srai_epi16(in4, 15); 
506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i sign_in5 = _mm_srai_epi16(in5, 15); 507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i sign_in6 = _mm_srai_epi16(in6, 15); 508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i sign_in7 = _mm_srai_epi16(in7, 15); 509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_sub_epi16(in0, sign_in0); 510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_sub_epi16(in1, sign_in1); 511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_sub_epi16(in2, sign_in2); 512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_sub_epi16(in3, sign_in3); 513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_sub_epi16(in4, sign_in4); 514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_sub_epi16(in5, sign_in5); 515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_sub_epi16(in6, sign_in6); 516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_sub_epi16(in7, sign_in7); 517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_srai_epi16(in0, 1); 518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_srai_epi16(in1, 1); 519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_srai_epi16(in2, 1); 520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_srai_epi16(in3, 1); 521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_srai_epi16(in4, 1); 522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_srai_epi16(in5, 1); 523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_srai_epi16(in6, 1); 524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_srai_epi16(in7, 1); 525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // store results 52691037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 0 * 8), in0); 52791037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 1 * 8), in1); 52891037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 2 * 8), in2); 
52991037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 3 * 8), in3); 53091037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 4 * 8), in4); 53191037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 5 * 8), in5); 53291037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 6 * 8), in6); 53391037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 7 * 8), in7); 53491037db265ecdd914a26e056cf69207b4f50924ehkuang } 53591037db265ecdd914a26e056cf69207b4f50924ehkuang} 53691037db265ecdd914a26e056cf69207b4f50924ehkuang 53791037db265ecdd914a26e056cf69207b4f50924ehkuang// Load an 8x8 block of int16_t: row i is read from input + i * stride with an aligned 128-bit load into in[i], then every 16-bit lane is shifted left by 2 (scaled by 4). 53891037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) { 53991037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_load_si128((__m128i *)(input + 0 * stride)); 54091037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_load_si128((__m128i *)(input + 1 * stride)); 54191037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_load_si128((__m128i *)(input + 2 * stride)); 54291037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_load_si128((__m128i *)(input + 3 * stride)); 54391037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_load_si128((__m128i *)(input + 4 * stride)); 54491037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_load_si128((__m128i *)(input + 5 * stride)); 54591037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_load_si128((__m128i *)(input + 6 * stride)); 54691037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_load_si128((__m128i *)(input + 7 * stride)); 54791037db265ecdd914a26e056cf69207b4f50924ehkuang 54891037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_slli_epi16(in[0], 2); 54991037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_slli_epi16(in[1], 2); 55091037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = 
_mm_slli_epi16(in[2], 2); 55191037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_slli_epi16(in[3], 2); 55291037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_slli_epi16(in[4], 2); 55391037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_slli_epi16(in[5], 2); 55491037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_slli_epi16(in[6], 2); 55591037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_slli_epi16(in[7], 2); 55691037db265ecdd914a26e056cf69207b4f50924ehkuang} 55791037db265ecdd914a26e056cf69207b4f50924ehkuang 55891037db265ecdd914a26e056cf69207b4f50924ehkuang// Rounded arithmetic right shift of res[0..7] by bit, in place: when bit >= 2 the value 1 << (bit - 2) is added to every 16-bit lane, then 1 is added to each lane that was negative on entry (its 0/-1 sign mask is subtracted), then every lane is shifted right by bit. 55991037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void right_shift_8x8(__m128i *res, int const bit) { 56091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i kOne = _mm_set1_epi16(1); 56191037db265ecdd914a26e056cf69207b4f50924ehkuang const int bit_m02 = bit - 2; 56291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i sign0 = _mm_srai_epi16(res[0], 15); 56391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i sign1 = _mm_srai_epi16(res[1], 15); 56491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i sign2 = _mm_srai_epi16(res[2], 15); 56591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i sign3 = _mm_srai_epi16(res[3], 15); 56691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i sign4 = _mm_srai_epi16(res[4], 15); 56791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i sign5 = _mm_srai_epi16(res[5], 15); 56891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i sign6 = _mm_srai_epi16(res[6], 15); 56991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i sign7 = _mm_srai_epi16(res[7], 15); 57091037db265ecdd914a26e056cf69207b4f50924ehkuang 57191037db265ecdd914a26e056cf69207b4f50924ehkuang if (bit_m02 >= 0) { 57291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02); 57391037db265ecdd914a26e056cf69207b4f50924ehkuang res[0] = _mm_add_epi16(res[0], k_const_rounding); 
57491037db265ecdd914a26e056cf69207b4f50924ehkuang res[1] = _mm_add_epi16(res[1], k_const_rounding); 57591037db265ecdd914a26e056cf69207b4f50924ehkuang res[2] = _mm_add_epi16(res[2], k_const_rounding); 57691037db265ecdd914a26e056cf69207b4f50924ehkuang res[3] = _mm_add_epi16(res[3], k_const_rounding); 57791037db265ecdd914a26e056cf69207b4f50924ehkuang res[4] = _mm_add_epi16(res[4], k_const_rounding); 57891037db265ecdd914a26e056cf69207b4f50924ehkuang res[5] = _mm_add_epi16(res[5], k_const_rounding); 57991037db265ecdd914a26e056cf69207b4f50924ehkuang res[6] = _mm_add_epi16(res[6], k_const_rounding); 58091037db265ecdd914a26e056cf69207b4f50924ehkuang res[7] = _mm_add_epi16(res[7], k_const_rounding); 58191037db265ecdd914a26e056cf69207b4f50924ehkuang } 58291037db265ecdd914a26e056cf69207b4f50924ehkuang 58391037db265ecdd914a26e056cf69207b4f50924ehkuang res[0] = _mm_sub_epi16(res[0], sign0); 58491037db265ecdd914a26e056cf69207b4f50924ehkuang res[1] = _mm_sub_epi16(res[1], sign1); 58591037db265ecdd914a26e056cf69207b4f50924ehkuang res[2] = _mm_sub_epi16(res[2], sign2); 58691037db265ecdd914a26e056cf69207b4f50924ehkuang res[3] = _mm_sub_epi16(res[3], sign3); 58791037db265ecdd914a26e056cf69207b4f50924ehkuang res[4] = _mm_sub_epi16(res[4], sign4); 58891037db265ecdd914a26e056cf69207b4f50924ehkuang res[5] = _mm_sub_epi16(res[5], sign5); 58991037db265ecdd914a26e056cf69207b4f50924ehkuang res[6] = _mm_sub_epi16(res[6], sign6); 59091037db265ecdd914a26e056cf69207b4f50924ehkuang res[7] = _mm_sub_epi16(res[7], sign7); 59191037db265ecdd914a26e056cf69207b4f50924ehkuang 59291037db265ecdd914a26e056cf69207b4f50924ehkuang res[0] = _mm_srai_epi16(res[0], bit); 59391037db265ecdd914a26e056cf69207b4f50924ehkuang res[1] = _mm_srai_epi16(res[1], bit); 59491037db265ecdd914a26e056cf69207b4f50924ehkuang res[2] = _mm_srai_epi16(res[2], bit); 59591037db265ecdd914a26e056cf69207b4f50924ehkuang res[3] = _mm_srai_epi16(res[3], bit); 59691037db265ecdd914a26e056cf69207b4f50924ehkuang res[4] = _mm_srai_epi16(res[4], 
bit); 59791037db265ecdd914a26e056cf69207b4f50924ehkuang res[5] = _mm_srai_epi16(res[5], bit); 59891037db265ecdd914a26e056cf69207b4f50924ehkuang res[6] = _mm_srai_epi16(res[6], bit); 59991037db265ecdd914a26e056cf69207b4f50924ehkuang res[7] = _mm_srai_epi16(res[7], bit); 60091037db265ecdd914a26e056cf69207b4f50924ehkuang} 60191037db265ecdd914a26e056cf69207b4f50924ehkuang 60291037db265ecdd914a26e056cf69207b4f50924ehkuang// Store res[0..7] as the rows of an 8x8 int16_t block: res[i] is written to output + i * stride with an aligned 128-bit store. 60391037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) { 60491037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 0 * stride), res[0]); 60591037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 1 * stride), res[1]); 60691037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 2 * stride), res[2]); 60791037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 3 * stride), res[3]); 60891037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 4 * stride), res[4]); 60991037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 5 * stride), res[5]); 61091037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 6 * stride), res[6]); 61191037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(output + 7 * stride), res[7]); 61291037db265ecdd914a26e056cf69207b4f50924ehkuang} 61391037db265ecdd914a26e056cf69207b4f50924ehkuang 61491037db265ecdd914a26e056cf69207b4f50924ehkuang// Transpose the 8x8 matrix of 16-bit lanes held in in[0..7] into res[0..7]; every row of in is read before any row of res is written, so res may be the same array as in (in-place use). 61591037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { 61691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); 61791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); 61891037db265ecdd914a26e056cf69207b4f50924ehkuang 
const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); 61991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); 62091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); 62191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); 62291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); 62391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); 62491037db265ecdd914a26e056cf69207b4f50924ehkuang // 00 10 01 11 02 12 03 13 62591037db265ecdd914a26e056cf69207b4f50924ehkuang // 20 30 21 31 22 32 23 33 62691037db265ecdd914a26e056cf69207b4f50924ehkuang // 04 14 05 15 06 16 07 17 62791037db265ecdd914a26e056cf69207b4f50924ehkuang // 24 34 25 35 26 36 27 37 62891037db265ecdd914a26e056cf69207b4f50924ehkuang // 40 50 41 51 42 52 43 53 62991037db265ecdd914a26e056cf69207b4f50924ehkuang // 60 70 61 71 62 72 63 73 63091037db265ecdd914a26e056cf69207b4f50924ehkuang // 44 54 45 55 46 56 47 57 63191037db265ecdd914a26e056cf69207b4f50924ehkuang // 64 74 65 75 66 76 67 77 63291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 63391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); 63491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 63591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); 63691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); 63791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 63891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); 63991037db265ecdd914a26e056cf69207b4f50924ehkuang const 
__m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 64091037db265ecdd914a26e056cf69207b4f50924ehkuang // 00 10 20 30 01 11 21 31 64191037db265ecdd914a26e056cf69207b4f50924ehkuang // 40 50 60 70 41 51 61 71 64291037db265ecdd914a26e056cf69207b4f50924ehkuang // 02 12 22 32 03 13 23 33 64391037db265ecdd914a26e056cf69207b4f50924ehkuang // 42 52 62 72 43 53 63 73 64491037db265ecdd914a26e056cf69207b4f50924ehkuang // 04 14 24 34 05 15 25 35 64591037db265ecdd914a26e056cf69207b4f50924ehkuang // 44 54 64 74 45 55 65 75 64691037db265ecdd914a26e056cf69207b4f50924ehkuang // 06 16 26 36 07 17 27 37 64791037db265ecdd914a26e056cf69207b4f50924ehkuang // 46 56 66 76 47 57 67 77 64891037db265ecdd914a26e056cf69207b4f50924ehkuang res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); 64991037db265ecdd914a26e056cf69207b4f50924ehkuang res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); 65091037db265ecdd914a26e056cf69207b4f50924ehkuang res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); 65191037db265ecdd914a26e056cf69207b4f50924ehkuang res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); 65291037db265ecdd914a26e056cf69207b4f50924ehkuang res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); 65391037db265ecdd914a26e056cf69207b4f50924ehkuang res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); 65491037db265ecdd914a26e056cf69207b4f50924ehkuang res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); 65591037db265ecdd914a26e056cf69207b4f50924ehkuang res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); 65691037db265ecdd914a26e056cf69207b4f50924ehkuang // 00 10 20 30 40 50 60 70 65791037db265ecdd914a26e056cf69207b4f50924ehkuang // 01 11 21 31 41 51 61 71 65891037db265ecdd914a26e056cf69207b4f50924ehkuang // 02 12 22 32 42 52 62 72 65991037db265ecdd914a26e056cf69207b4f50924ehkuang // 03 13 23 33 43 53 63 73 66091037db265ecdd914a26e056cf69207b4f50924ehkuang // 04 14 24 34 44 54 64 74 66191037db265ecdd914a26e056cf69207b4f50924ehkuang // 05 15 25 35 45 55 65 75 66291037db265ecdd914a26e056cf69207b4f50924ehkuang // 06 16 26 36 46 56 66 76 66391037db265ecdd914a26e056cf69207b4f50924ehkuang 
// 07 17 27 37 47 57 67 77 66491037db265ecdd914a26e056cf69207b4f50924ehkuang} 66591037db265ecdd914a26e056cf69207b4f50924ehkuang 66691037db265ecdd914a26e056cf69207b4f50924ehkuangvoid fdct8_1d_sse2(__m128i *in) { 66791037db265ecdd914a26e056cf69207b4f50924ehkuang // constants 66891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 66991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 67091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 67191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 67291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 67391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 67491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 67591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 67691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 67791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i u0, u1, u2, u3, u4, u5, u6, u7; 67891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i v0, v1, v2, v3, v4, v5, v6, v7; 67991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s0, s1, s2, s3, s4, s5, s6, s7; 68091037db265ecdd914a26e056cf69207b4f50924ehkuang 68191037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 1 68291037db265ecdd914a26e056cf69207b4f50924ehkuang s0 = _mm_add_epi16(in[0], in[7]); 68391037db265ecdd914a26e056cf69207b4f50924ehkuang s1 = _mm_add_epi16(in[1], in[6]); 68491037db265ecdd914a26e056cf69207b4f50924ehkuang s2 = 
_mm_add_epi16(in[2], in[5]); 68591037db265ecdd914a26e056cf69207b4f50924ehkuang s3 = _mm_add_epi16(in[3], in[4]); 68691037db265ecdd914a26e056cf69207b4f50924ehkuang s4 = _mm_sub_epi16(in[3], in[4]); 68791037db265ecdd914a26e056cf69207b4f50924ehkuang s5 = _mm_sub_epi16(in[2], in[5]); 68891037db265ecdd914a26e056cf69207b4f50924ehkuang s6 = _mm_sub_epi16(in[1], in[6]); 68991037db265ecdd914a26e056cf69207b4f50924ehkuang s7 = _mm_sub_epi16(in[0], in[7]); 69091037db265ecdd914a26e056cf69207b4f50924ehkuang 69191037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_add_epi16(s0, s3); 69291037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_add_epi16(s1, s2); 69391037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_sub_epi16(s1, s2); 69491037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_sub_epi16(s0, s3); 69591037db265ecdd914a26e056cf69207b4f50924ehkuang // interleave and perform butterfly multiplication/addition 69691037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_unpacklo_epi16(u0, u1); 69791037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_unpackhi_epi16(u0, u1); 69891037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_unpacklo_epi16(u2, u3); 69991037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_unpackhi_epi16(u2, u3); 70091037db265ecdd914a26e056cf69207b4f50924ehkuang 70191037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); 70291037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); 70391037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); 70491037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); 70591037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); 70691037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); 70791037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); 
70891037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); 70991037db265ecdd914a26e056cf69207b4f50924ehkuang 71091037db265ecdd914a26e056cf69207b4f50924ehkuang // shift and rounding 71191037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 71291037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 71391037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 71491037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 71591037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 71691037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 71791037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 71891037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 71991037db265ecdd914a26e056cf69207b4f50924ehkuang 72091037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 72191037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 72291037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 72391037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 72491037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 72591037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 72691037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 72791037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 72891037db265ecdd914a26e056cf69207b4f50924ehkuang 72991037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_packs_epi32(u0, u1); 73091037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_packs_epi32(u4, u5); 
73191037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_packs_epi32(u2, u3); 73291037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_packs_epi32(u6, u7); 73391037db265ecdd914a26e056cf69207b4f50924ehkuang 73491037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 73591037db265ecdd914a26e056cf69207b4f50924ehkuang // interleave and perform butterfly multiplication/addition 73691037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_unpacklo_epi16(s6, s5); 73791037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_unpackhi_epi16(s6, s5); 73891037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); 73991037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); 74091037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); 74191037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); 74291037db265ecdd914a26e056cf69207b4f50924ehkuang 74391037db265ecdd914a26e056cf69207b4f50924ehkuang // shift and rounding 74491037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 74591037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 74691037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 74791037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 74891037db265ecdd914a26e056cf69207b4f50924ehkuang 74991037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 75091037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 75191037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 75291037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 75391037db265ecdd914a26e056cf69207b4f50924ehkuang 75491037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_packs_epi32(v0, v1); 
75591037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_packs_epi32(v2, v3); 75691037db265ecdd914a26e056cf69207b4f50924ehkuang 75791037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 3 75891037db265ecdd914a26e056cf69207b4f50924ehkuang s0 = _mm_add_epi16(s4, u0); 75991037db265ecdd914a26e056cf69207b4f50924ehkuang s1 = _mm_sub_epi16(s4, u0); 76091037db265ecdd914a26e056cf69207b4f50924ehkuang s2 = _mm_sub_epi16(s7, u1); 76191037db265ecdd914a26e056cf69207b4f50924ehkuang s3 = _mm_add_epi16(s7, u1); 76291037db265ecdd914a26e056cf69207b4f50924ehkuang 76391037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 4 76491037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_unpacklo_epi16(s0, s3); 76591037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_unpackhi_epi16(s0, s3); 76691037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_unpacklo_epi16(s1, s2); 76791037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_unpackhi_epi16(s1, s2); 76891037db265ecdd914a26e056cf69207b4f50924ehkuang 76991037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); 77091037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); 77191037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); 77291037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); 77391037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); 77491037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); 77591037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); 77691037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); 77791037db265ecdd914a26e056cf69207b4f50924ehkuang 77891037db265ecdd914a26e056cf69207b4f50924ehkuang // shift and rounding 77991037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 
78091037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 78191037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 78291037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 78391037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 78491037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 78591037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 78691037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 78791037db265ecdd914a26e056cf69207b4f50924ehkuang 78891037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 78991037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 79091037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 79191037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 79291037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 79391037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 79491037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 79591037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 79691037db265ecdd914a26e056cf69207b4f50924ehkuang 79791037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_packs_epi32(v0, v1); 79891037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_packs_epi32(v4, v5); 79991037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_packs_epi32(v2, v3); 80091037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_packs_epi32(v6, v7); 80191037db265ecdd914a26e056cf69207b4f50924ehkuang 80291037db265ecdd914a26e056cf69207b4f50924ehkuang // transpose 
80391037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(in, in); 80491037db265ecdd914a26e056cf69207b4f50924ehkuang} 80591037db265ecdd914a26e056cf69207b4f50924ehkuang 80691037db265ecdd914a26e056cf69207b4f50924ehkuangvoid fadst8_1d_sse2(__m128i *in) { 80791037db265ecdd914a26e056cf69207b4f50924ehkuang // Constants 80891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 80991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 81091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 81191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 81291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 81391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 81491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 81591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 81691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 81791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 81891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 81991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 82091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 82191037db265ecdd914a26e056cf69207b4f50924ehkuang const 
__m128i k__const_0 = _mm_set1_epi16(0); 82291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 82391037db265ecdd914a26e056cf69207b4f50924ehkuang 82491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; 82591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 82691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; 82791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s0, s1, s2, s3, s4, s5, s6, s7; 82891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in0, in1, in2, in3, in4, in5, in6, in7; 82991037db265ecdd914a26e056cf69207b4f50924ehkuang 83091037db265ecdd914a26e056cf69207b4f50924ehkuang // properly aligned for butterfly input 83191037db265ecdd914a26e056cf69207b4f50924ehkuang in0 = in[7]; 83291037db265ecdd914a26e056cf69207b4f50924ehkuang in1 = in[0]; 83391037db265ecdd914a26e056cf69207b4f50924ehkuang in2 = in[5]; 83491037db265ecdd914a26e056cf69207b4f50924ehkuang in3 = in[2]; 83591037db265ecdd914a26e056cf69207b4f50924ehkuang in4 = in[3]; 83691037db265ecdd914a26e056cf69207b4f50924ehkuang in5 = in[4]; 83791037db265ecdd914a26e056cf69207b4f50924ehkuang in6 = in[1]; 83891037db265ecdd914a26e056cf69207b4f50924ehkuang in7 = in[6]; 83991037db265ecdd914a26e056cf69207b4f50924ehkuang 84091037db265ecdd914a26e056cf69207b4f50924ehkuang // column transformation 84191037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 1 84291037db265ecdd914a26e056cf69207b4f50924ehkuang // interleave and multiply/add into 32-bit integer 84391037db265ecdd914a26e056cf69207b4f50924ehkuang s0 = _mm_unpacklo_epi16(in0, in1); 84491037db265ecdd914a26e056cf69207b4f50924ehkuang s1 = _mm_unpackhi_epi16(in0, in1); 84591037db265ecdd914a26e056cf69207b4f50924ehkuang s2 = _mm_unpacklo_epi16(in2, in3); 
84691037db265ecdd914a26e056cf69207b4f50924ehkuang s3 = _mm_unpackhi_epi16(in2, in3); 84791037db265ecdd914a26e056cf69207b4f50924ehkuang s4 = _mm_unpacklo_epi16(in4, in5); 84891037db265ecdd914a26e056cf69207b4f50924ehkuang s5 = _mm_unpackhi_epi16(in4, in5); 84991037db265ecdd914a26e056cf69207b4f50924ehkuang s6 = _mm_unpacklo_epi16(in6, in7); 85091037db265ecdd914a26e056cf69207b4f50924ehkuang s7 = _mm_unpackhi_epi16(in6, in7); 85191037db265ecdd914a26e056cf69207b4f50924ehkuang 85291037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); 85391037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); 85491037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); 85591037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); 85691037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); 85791037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); 85891037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); 85991037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); 86091037db265ecdd914a26e056cf69207b4f50924ehkuang u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); 86191037db265ecdd914a26e056cf69207b4f50924ehkuang u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); 86291037db265ecdd914a26e056cf69207b4f50924ehkuang u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); 86391037db265ecdd914a26e056cf69207b4f50924ehkuang u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); 86491037db265ecdd914a26e056cf69207b4f50924ehkuang u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); 86591037db265ecdd914a26e056cf69207b4f50924ehkuang u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); 86691037db265ecdd914a26e056cf69207b4f50924ehkuang u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); 86791037db265ecdd914a26e056cf69207b4f50924ehkuang u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); 
86891037db265ecdd914a26e056cf69207b4f50924ehkuang 86991037db265ecdd914a26e056cf69207b4f50924ehkuang // addition 87091037db265ecdd914a26e056cf69207b4f50924ehkuang w0 = _mm_add_epi32(u0, u8); 87191037db265ecdd914a26e056cf69207b4f50924ehkuang w1 = _mm_add_epi32(u1, u9); 87291037db265ecdd914a26e056cf69207b4f50924ehkuang w2 = _mm_add_epi32(u2, u10); 87391037db265ecdd914a26e056cf69207b4f50924ehkuang w3 = _mm_add_epi32(u3, u11); 87491037db265ecdd914a26e056cf69207b4f50924ehkuang w4 = _mm_add_epi32(u4, u12); 87591037db265ecdd914a26e056cf69207b4f50924ehkuang w5 = _mm_add_epi32(u5, u13); 87691037db265ecdd914a26e056cf69207b4f50924ehkuang w6 = _mm_add_epi32(u6, u14); 87791037db265ecdd914a26e056cf69207b4f50924ehkuang w7 = _mm_add_epi32(u7, u15); 87891037db265ecdd914a26e056cf69207b4f50924ehkuang w8 = _mm_sub_epi32(u0, u8); 87991037db265ecdd914a26e056cf69207b4f50924ehkuang w9 = _mm_sub_epi32(u1, u9); 88091037db265ecdd914a26e056cf69207b4f50924ehkuang w10 = _mm_sub_epi32(u2, u10); 88191037db265ecdd914a26e056cf69207b4f50924ehkuang w11 = _mm_sub_epi32(u3, u11); 88291037db265ecdd914a26e056cf69207b4f50924ehkuang w12 = _mm_sub_epi32(u4, u12); 88391037db265ecdd914a26e056cf69207b4f50924ehkuang w13 = _mm_sub_epi32(u5, u13); 88491037db265ecdd914a26e056cf69207b4f50924ehkuang w14 = _mm_sub_epi32(u6, u14); 88591037db265ecdd914a26e056cf69207b4f50924ehkuang w15 = _mm_sub_epi32(u7, u15); 88691037db265ecdd914a26e056cf69207b4f50924ehkuang 88791037db265ecdd914a26e056cf69207b4f50924ehkuang // shift and rounding 88891037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 88991037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 89091037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 89191037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 89291037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 
89391037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 89491037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 89591037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 89691037db265ecdd914a26e056cf69207b4f50924ehkuang v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); 89791037db265ecdd914a26e056cf69207b4f50924ehkuang v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); 89891037db265ecdd914a26e056cf69207b4f50924ehkuang v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); 89991037db265ecdd914a26e056cf69207b4f50924ehkuang v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); 90091037db265ecdd914a26e056cf69207b4f50924ehkuang v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); 90191037db265ecdd914a26e056cf69207b4f50924ehkuang v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); 90291037db265ecdd914a26e056cf69207b4f50924ehkuang v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); 90391037db265ecdd914a26e056cf69207b4f50924ehkuang v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); 90491037db265ecdd914a26e056cf69207b4f50924ehkuang 90591037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 90691037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 90791037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 90891037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 90991037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 91091037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 91191037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 91291037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 91391037db265ecdd914a26e056cf69207b4f50924ehkuang u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); 91491037db265ecdd914a26e056cf69207b4f50924ehkuang 
u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); 91591037db265ecdd914a26e056cf69207b4f50924ehkuang u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); 91691037db265ecdd914a26e056cf69207b4f50924ehkuang u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); 91791037db265ecdd914a26e056cf69207b4f50924ehkuang u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); 91891037db265ecdd914a26e056cf69207b4f50924ehkuang u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); 91991037db265ecdd914a26e056cf69207b4f50924ehkuang u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); 92091037db265ecdd914a26e056cf69207b4f50924ehkuang u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); 92191037db265ecdd914a26e056cf69207b4f50924ehkuang 92291037db265ecdd914a26e056cf69207b4f50924ehkuang // back to 16-bit and pack 8 integers into __m128i 92391037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_packs_epi32(u0, u1); 92491037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_packs_epi32(u2, u3); 92591037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_packs_epi32(u4, u5); 92691037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_packs_epi32(u6, u7); 92791037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_packs_epi32(u8, u9); 92891037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_packs_epi32(u10, u11); 92991037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_packs_epi32(u12, u13); 93091037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_packs_epi32(u14, u15); 93191037db265ecdd914a26e056cf69207b4f50924ehkuang 93291037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 93391037db265ecdd914a26e056cf69207b4f50924ehkuang s0 = _mm_add_epi16(in[0], in[2]); 93491037db265ecdd914a26e056cf69207b4f50924ehkuang s1 = _mm_add_epi16(in[1], in[3]); 93591037db265ecdd914a26e056cf69207b4f50924ehkuang s2 = _mm_sub_epi16(in[0], in[2]); 93691037db265ecdd914a26e056cf69207b4f50924ehkuang s3 = _mm_sub_epi16(in[1], in[3]); 93791037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_unpacklo_epi16(in[4], in[5]); 
93891037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_unpackhi_epi16(in[4], in[5]); 93991037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_unpacklo_epi16(in[6], in[7]); 94091037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_unpackhi_epi16(in[6], in[7]); 94191037db265ecdd914a26e056cf69207b4f50924ehkuang 94291037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); 94391037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); 94491037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); 94591037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); 94691037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); 94791037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); 94891037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); 94991037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); 95091037db265ecdd914a26e056cf69207b4f50924ehkuang 95191037db265ecdd914a26e056cf69207b4f50924ehkuang w0 = _mm_add_epi32(v0, v4); 95291037db265ecdd914a26e056cf69207b4f50924ehkuang w1 = _mm_add_epi32(v1, v5); 95391037db265ecdd914a26e056cf69207b4f50924ehkuang w2 = _mm_add_epi32(v2, v6); 95491037db265ecdd914a26e056cf69207b4f50924ehkuang w3 = _mm_add_epi32(v3, v7); 95591037db265ecdd914a26e056cf69207b4f50924ehkuang w4 = _mm_sub_epi32(v0, v4); 95691037db265ecdd914a26e056cf69207b4f50924ehkuang w5 = _mm_sub_epi32(v1, v5); 95791037db265ecdd914a26e056cf69207b4f50924ehkuang w6 = _mm_sub_epi32(v2, v6); 95891037db265ecdd914a26e056cf69207b4f50924ehkuang w7 = _mm_sub_epi32(v3, v7); 95991037db265ecdd914a26e056cf69207b4f50924ehkuang 96091037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 96191037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 
96291037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 96391037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 96491037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 96591037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 96691037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 96791037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 96891037db265ecdd914a26e056cf69207b4f50924ehkuang 96991037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 97091037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 97191037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 97291037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 97391037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 97491037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 97591037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 97691037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 97791037db265ecdd914a26e056cf69207b4f50924ehkuang 97891037db265ecdd914a26e056cf69207b4f50924ehkuang // back to 16-bit intergers 97991037db265ecdd914a26e056cf69207b4f50924ehkuang s4 = _mm_packs_epi32(u0, u1); 98091037db265ecdd914a26e056cf69207b4f50924ehkuang s5 = _mm_packs_epi32(u2, u3); 98191037db265ecdd914a26e056cf69207b4f50924ehkuang s6 = _mm_packs_epi32(u4, u5); 98291037db265ecdd914a26e056cf69207b4f50924ehkuang s7 = _mm_packs_epi32(u6, u7); 98391037db265ecdd914a26e056cf69207b4f50924ehkuang 98491037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 3 98591037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_unpacklo_epi16(s2, 
s3); 98691037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_unpackhi_epi16(s2, s3); 98791037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_unpacklo_epi16(s6, s7); 98891037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_unpackhi_epi16(s6, s7); 98991037db265ecdd914a26e056cf69207b4f50924ehkuang 99091037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); 99191037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); 99291037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); 99391037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); 99491037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); 99591037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); 99691037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); 99791037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); 99891037db265ecdd914a26e056cf69207b4f50924ehkuang 99991037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 100091037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 100191037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 100291037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 100391037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 100491037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 100591037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 100691037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 100791037db265ecdd914a26e056cf69207b4f50924ehkuang 100891037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = 
_mm_srai_epi32(u0, DCT_CONST_BITS); 100991037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 101091037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 101191037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 101291037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 101391037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 101491037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 101591037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 101691037db265ecdd914a26e056cf69207b4f50924ehkuang 101791037db265ecdd914a26e056cf69207b4f50924ehkuang s2 = _mm_packs_epi32(v0, v1); 101891037db265ecdd914a26e056cf69207b4f50924ehkuang s3 = _mm_packs_epi32(v2, v3); 101991037db265ecdd914a26e056cf69207b4f50924ehkuang s6 = _mm_packs_epi32(v4, v5); 102091037db265ecdd914a26e056cf69207b4f50924ehkuang s7 = _mm_packs_epi32(v6, v7); 102191037db265ecdd914a26e056cf69207b4f50924ehkuang 102291037db265ecdd914a26e056cf69207b4f50924ehkuang // FIXME(jingning): do subtract using bit inversion? 
// 8x8 forward hybrid transform (SSE2).
//
// input:   source 16-bit samples, loaded via load_buffer_8x8 with `stride`.
// output:  destination for the 64 results, written via write_buffer_8x8 with
//          a stride argument of 8.
// stride:  forwarded unchanged to load_buffer_8x8.
// tx_type: selects the two 1-D passes, applied in this order:
//            0 (DCT_DCT):   fdct8,  fdct8
//            1 (ADST_DCT):  fadst8, fdct8
//            2 (DCT_ADST):  fdct8,  fadst8
//            3 (ADST_ADST): fadst8, fadst8
//          Any other value trips assert(0); when asserts are compiled out no
//          1-D pass runs and the loaded data is still shifted and written.
//
// Each 1-D pass ends with a transpose, so two passes cover both dimensions.
void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
                           int stride, int tx_type) {
  __m128i block[8];
  const int known_type = (tx_type >= 0 && tx_type <= 3);

  load_buffer_8x8(input, block, stride);

  if (!known_type) {
    assert(0);
  } else {
    // Bit 0 of tx_type picks the first pass, bit 1 the second pass;
    // a set bit means ADST, a clear bit means DCT.
    if (tx_type & 1) {
      fadst8_1d_sse2(block);
    } else {
      fdct8_1d_sse2(block);
    }
    if (tx_type & 2) {
      fadst8_1d_sse2(block);
    } else {
      fdct8_1d_sse2(block);
    }
  }

  // Post-scale by one bit (helper defined elsewhere in this file), then store.
  right_shift_8x8(block, 1);
  write_buffer_8x8(output, block, 8);
}
107591037db265ecdd914a26e056cf69207b4f50924ehkuang DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); 1076ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int16_t *in = input; 1077ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int16_t *out = intermediate; 1078ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Constants 1079ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // When we use them, in one case, they are all the same. In all others 1080ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // it's a pair of them that we need to repeat four times. This is done 1081ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // by constructing the 32 bit constant corresponding to that pair. 1082ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1083ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1084ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 1085ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1086ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1087ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 1088ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 1089ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 1090ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 1091ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); 1092ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i 
k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); 1093ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 1094ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 1095ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); 1096ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); 1097ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 1098ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 1099ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i kOne = _mm_set1_epi16(1); 1101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Do the two transform/transpose passes 1102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang for (pass = 0; pass < 2; ++pass) { 1103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // We process eight columns (transposed rows in second pass) at a time. 
1104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int column_start; 1105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang for (column_start = 0; column_start < 16; column_start += 8) { 1106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i in00, in01, in02, in03, in04, in05, in06, in07; 1107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i in08, in09, in10, in11, in12, in13, in14, in15; 1108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i input0, input1, input2, input3, input4, input5, input6, input7; 1109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i step1_0, step1_1, step1_2, step1_3; 1110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i step1_4, step1_5, step1_6, step1_7; 1111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; 1112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i step3_0, step3_1, step3_2, step3_3; 1113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i step3_4, step3_5, step3_6, step3_7; 1114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i res00, res01, res02, res03, res04, res05, res06, res07; 1115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i res08, res09, res10, res11, res12, res13, res14, res15; 1116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Load and pre-condition input. 
1117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (0 == pass) { 111891037db265ecdd914a26e056cf69207b4f50924ehkuang in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); 111991037db265ecdd914a26e056cf69207b4f50924ehkuang in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); 112091037db265ecdd914a26e056cf69207b4f50924ehkuang in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); 112191037db265ecdd914a26e056cf69207b4f50924ehkuang in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); 112291037db265ecdd914a26e056cf69207b4f50924ehkuang in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); 112391037db265ecdd914a26e056cf69207b4f50924ehkuang in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); 112491037db265ecdd914a26e056cf69207b4f50924ehkuang in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); 112591037db265ecdd914a26e056cf69207b4f50924ehkuang in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); 112691037db265ecdd914a26e056cf69207b4f50924ehkuang in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); 112791037db265ecdd914a26e056cf69207b4f50924ehkuang in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); 112891037db265ecdd914a26e056cf69207b4f50924ehkuang in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); 112991037db265ecdd914a26e056cf69207b4f50924ehkuang in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); 113091037db265ecdd914a26e056cf69207b4f50924ehkuang in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); 113191037db265ecdd914a26e056cf69207b4f50924ehkuang in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); 113291037db265ecdd914a26e056cf69207b4f50924ehkuang in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); 113391037db265ecdd914a26e056cf69207b4f50924ehkuang in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); 1134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // x = x << 2 1135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in00 = _mm_slli_epi16(in00, 2); 
1136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in01 = _mm_slli_epi16(in01, 2); 1137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in02 = _mm_slli_epi16(in02, 2); 1138ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in03 = _mm_slli_epi16(in03, 2); 1139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in04 = _mm_slli_epi16(in04, 2); 1140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in05 = _mm_slli_epi16(in05, 2); 1141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in06 = _mm_slli_epi16(in06, 2); 1142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in07 = _mm_slli_epi16(in07, 2); 1143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in08 = _mm_slli_epi16(in08, 2); 1144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in09 = _mm_slli_epi16(in09, 2); 1145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_slli_epi16(in10, 2); 1146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_slli_epi16(in11, 2); 1147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_slli_epi16(in12, 2); 1148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_slli_epi16(in13, 2); 1149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_slli_epi16(in14, 2); 1150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_slli_epi16(in15, 2); 1151ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } else { 115291037db265ecdd914a26e056cf69207b4f50924ehkuang in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); 115391037db265ecdd914a26e056cf69207b4f50924ehkuang in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); 115491037db265ecdd914a26e056cf69207b4f50924ehkuang in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); 115591037db265ecdd914a26e056cf69207b4f50924ehkuang in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); 115691037db265ecdd914a26e056cf69207b4f50924ehkuang in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); 115791037db265ecdd914a26e056cf69207b4f50924ehkuang in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); 115891037db265ecdd914a26e056cf69207b4f50924ehkuang in06 = 
_mm_load_si128((const __m128i *)(in + 6 * 16)); 115991037db265ecdd914a26e056cf69207b4f50924ehkuang in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); 116091037db265ecdd914a26e056cf69207b4f50924ehkuang in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); 116191037db265ecdd914a26e056cf69207b4f50924ehkuang in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); 116291037db265ecdd914a26e056cf69207b4f50924ehkuang in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); 116391037db265ecdd914a26e056cf69207b4f50924ehkuang in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); 116491037db265ecdd914a26e056cf69207b4f50924ehkuang in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); 116591037db265ecdd914a26e056cf69207b4f50924ehkuang in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); 116691037db265ecdd914a26e056cf69207b4f50924ehkuang in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); 116791037db265ecdd914a26e056cf69207b4f50924ehkuang in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); 1168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // x = (x + 1) >> 2 1169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in00 = _mm_add_epi16(in00, kOne); 1170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in01 = _mm_add_epi16(in01, kOne); 1171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in02 = _mm_add_epi16(in02, kOne); 1172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in03 = _mm_add_epi16(in03, kOne); 1173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in04 = _mm_add_epi16(in04, kOne); 1174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in05 = _mm_add_epi16(in05, kOne); 1175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in06 = _mm_add_epi16(in06, kOne); 1176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in07 = _mm_add_epi16(in07, kOne); 1177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in08 = _mm_add_epi16(in08, kOne); 1178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in09 = _mm_add_epi16(in09, kOne); 1179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = 
_mm_add_epi16(in10, kOne); 1180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_add_epi16(in11, kOne); 1181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_add_epi16(in12, kOne); 1182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_add_epi16(in13, kOne); 1183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_add_epi16(in14, kOne); 1184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_add_epi16(in15, kOne); 1185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in00 = _mm_srai_epi16(in00, 2); 1186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in01 = _mm_srai_epi16(in01, 2); 1187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in02 = _mm_srai_epi16(in02, 2); 1188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in03 = _mm_srai_epi16(in03, 2); 1189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in04 = _mm_srai_epi16(in04, 2); 1190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in05 = _mm_srai_epi16(in05, 2); 1191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in06 = _mm_srai_epi16(in06, 2); 1192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in07 = _mm_srai_epi16(in07, 2); 1193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in08 = _mm_srai_epi16(in08, 2); 1194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in09 = _mm_srai_epi16(in09, 2); 1195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_srai_epi16(in10, 2); 1196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_srai_epi16(in11, 2); 1197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_srai_epi16(in12, 2); 1198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_srai_epi16(in13, 2); 1199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_srai_epi16(in14, 2); 1200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_srai_epi16(in15, 2); 1201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in += 8; 1203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Calculate input for the first 8 results. 
1204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_add_epi16(in00, in15); 1206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_add_epi16(in01, in14); 1207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_add_epi16(in02, in13); 1208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_add_epi16(in03, in12); 1209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input4 = _mm_add_epi16(in04, in11); 1210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input5 = _mm_add_epi16(in05, in10); 1211ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input6 = _mm_add_epi16(in06, in09); 1212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input7 = _mm_add_epi16(in07, in08); 1213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Calculate input for the next 8 results. 1215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1216ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_0 = _mm_sub_epi16(in07, in08); 1217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_1 = _mm_sub_epi16(in06, in09); 1218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_2 = _mm_sub_epi16(in05, in10); 1219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_3 = _mm_sub_epi16(in04, in11); 1220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_4 = _mm_sub_epi16(in03, in12); 1221ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_5 = _mm_sub_epi16(in02, in13); 1222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_6 = _mm_sub_epi16(in01, in14); 1223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_7 = _mm_sub_epi16(in00, in15); 1224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Work on the first eight values; fdct8_1d(input, even_results); 1226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1227ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Add/substract 1228ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
const __m128i q0 = _mm_add_epi16(input0, input7); 1229ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q1 = _mm_add_epi16(input1, input6); 1230ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q2 = _mm_add_epi16(input2, input5); 1231ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q3 = _mm_add_epi16(input3, input4); 1232ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q4 = _mm_sub_epi16(input3, input4); 1233ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q5 = _mm_sub_epi16(input2, input5); 1234ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q6 = _mm_sub_epi16(input1, input6); 1235ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i q7 = _mm_sub_epi16(input0, input7); 1236ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Work on first four results 1237ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1238ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Add/substract 1239ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r0 = _mm_add_epi16(q0, q3); 1240ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r1 = _mm_add_epi16(q1, q2); 1241ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r2 = _mm_sub_epi16(q1, q2); 1242ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r3 = _mm_sub_epi16(q0, q3); 1243ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Interleave to do the multiply by constants which gets us 1244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // into 32 bits. 
1245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 1246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t1 = _mm_unpackhi_epi16(r0, r1); 1247ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 1248ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(r2, r3); 1249ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 1250ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 1251ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 1252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); 1253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 1254ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); 1255ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 1256ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); 1257ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 1258ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1259ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1260ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1261ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1262ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 1263ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v5 = _mm_add_epi32(u5, 
k__DCT_CONST_ROUNDING); 1264ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 1265ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 1266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1267ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1268ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1269ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1270ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 1271ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 1272ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 1273ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 1274ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 1275ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res00 = _mm_packs_epi32(w0, w1); 1276ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res08 = _mm_packs_epi32(w2, w3); 1277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res04 = _mm_packs_epi32(w4, w5); 1278ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res12 = _mm_packs_epi32(w6, w7); 1279ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1280ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Work on next four results 1281ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1282ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Interleave to do the multiply by constants which gets us 1283ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // into 32 bits. 
1284ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i d0 = _mm_unpacklo_epi16(q6, q5); 1285ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i d1 = _mm_unpackhi_epi16(q6, q5); 1286ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); 1287ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); 1288ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); 1289ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); 1290ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 1291ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); 1292ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); 1293ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); 1294ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); 1295ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); 1296ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); 1297ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); 1298ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); 1299ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 1300ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r0 = _mm_packs_epi32(s0, s1); 1301ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i r1 = _mm_packs_epi32(s2, s3); 1302ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Add/substract 1303ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i x0 = _mm_add_epi16(q4, r0); 
1304ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i x1 = _mm_sub_epi16(q4, r0); 1305ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i x2 = _mm_sub_epi16(q7, r1); 1306ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i x3 = _mm_add_epi16(q7, r1); 1307ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Interleave to do the multiply by constants which gets us 1308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // into 32 bits. 1309ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(x0, x3); 1310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t1 = _mm_unpackhi_epi16(x0, x3); 1311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(x1, x2); 1312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(x1, x2); 1313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); 1314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); 1315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); 1316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); 1317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); 1318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); 1319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); 1320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); 1321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 1322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, 
k__DCT_CONST_ROUNDING); 1324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 1327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 1328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 1329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 1330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 1335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 1336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 1337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 1338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 1339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res02 = _mm_packs_epi32(w0, w1); 1340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res14 = _mm_packs_epi32(w2, w3); 1341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res10 = _mm_packs_epi32(w4, w5); 1342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res06 = _mm_packs_epi32(w6, w7); 1343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 
1344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Work on the next eight values; step1 -> odd_results 1346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // step 2 1348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); 1350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); 1351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); 1352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); 1353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); 1354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); 1355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); 1356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); 1357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 1358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = 
_mm_srai_epi32(v2, DCT_CONST_BITS); 1365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 1367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step2_2 = _mm_packs_epi32(w0, w1); 1368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step2_3 = _mm_packs_epi32(w2, w3); 1369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); 1372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); 1373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); 1374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); 1375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 1376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 1377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); 1378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); 1379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 1380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 
1385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 1389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step2_5 = _mm_packs_epi32(w0, w1); 1390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step2_4 = _mm_packs_epi32(w2, w3); 1391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // step 3 1393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step3_0 = _mm_add_epi16(step1_0, step2_3); 1395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step3_1 = _mm_add_epi16(step1_1, step2_2); 1396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step3_2 = _mm_sub_epi16(step1_1, step2_2); 1397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step3_3 = _mm_sub_epi16(step1_0, step2_3); 1398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step3_4 = _mm_sub_epi16(step1_7, step2_4); 1399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step3_5 = _mm_sub_epi16(step1_6, step2_5); 1400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step3_6 = _mm_add_epi16(step1_6, step2_5); 1401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step3_7 = _mm_add_epi16(step1_7, step2_4); 1402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // step 4 1404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); 1406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); 1407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); 
1408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); 1409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); 1410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); 1411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); 1412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); 1413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 1414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 1423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step2_1 = _mm_packs_epi32(w0, w1); 1424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step2_2 = _mm_packs_epi32(w2, w3); 1425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); 1428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const 
__m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); 1429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); 1430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); 1431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); 1432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); 1433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); 1434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); 1435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 1436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 1445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step2_6 = _mm_packs_epi32(w0, w1); 1446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step2_5 = _mm_packs_epi32(w2, w3); 1447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // step 5 
1449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_0 = _mm_add_epi16(step3_0, step2_1); 1451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_1 = _mm_sub_epi16(step3_0, step2_1); 1452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_2 = _mm_sub_epi16(step3_3, step2_2); 1453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_3 = _mm_add_epi16(step3_3, step2_2); 1454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_4 = _mm_add_epi16(step3_4, step2_5); 1455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_5 = _mm_sub_epi16(step3_4, step2_5); 1456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_6 = _mm_sub_epi16(step3_7, step2_6); 1457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang step1_7 = _mm_add_epi16(step3_7, step2_6); 1458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // step 6 1460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); 1462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); 1463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); 1464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); 1465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); 1466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); 1467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); 1468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); 1469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 1470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, 
k__DCT_CONST_ROUNDING); 1471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 1479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res01 = _mm_packs_epi32(w0, w1); 1480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res09 = _mm_packs_epi32(w2, w3); 1481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); 1484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); 1485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); 1486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); 1487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); 1488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); 1489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); 1490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); 
1491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 1492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 1501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res05 = _mm_packs_epi32(w0, w1); 1502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res13 = _mm_packs_epi32(w2, w3); 1503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); 1506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); 1507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); 1508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); 1509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); 1510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); 1511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 
= _mm_madd_epi16(t2, k__cospi_m26_p06); 1512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); 1513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 1514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 1523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res11 = _mm_packs_epi32(w0, w1); 1524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res03 = _mm_packs_epi32(w2, w3); 1525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); 1528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); 1529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); 1530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); 1531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); 
1532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); 1533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); 1534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); 1535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // dct_const_round_shift 1536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Combine 1545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res15 = _mm_packs_epi32(w0, w1); 1546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res07 = _mm_packs_epi32(w2, w3); 1547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Transpose the results, do it as two 8x8 transposes. 
1550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 01 02 03 04 05 06 07 1552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 10 11 12 13 14 15 16 17 1553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 20 21 22 23 24 25 26 27 1554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 30 31 32 33 34 35 36 37 1555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 40 41 42 43 44 45 46 47 1556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 50 51 52 53 54 55 56 57 1557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 60 61 62 63 64 65 66 67 1558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 70 71 72 73 74 75 76 77 1559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); 1560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); 1561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); 1562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); 1563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); 1564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); 1565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); 1566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); 1567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 10 01 11 02 12 03 13 1568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 20 30 21 31 22 32 23 33 1569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 04 14 05 15 06 16 07 17 1570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 24 34 25 35 26 36 27 37 1571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 40 50 41 51 42 52 43 53 1572ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 60 70 61 71 62 72 63 
73 1573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 44 54 45 55 46 56 47 57 1574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 64 74 65 75 66 76 67 77 1575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 1576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 1577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 1578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 1579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 1580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 1581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 1582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 1583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 10 20 30 01 11 21 31 1584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 40 50 60 70 41 51 61 71 1585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 02 12 22 32 03 13 23 33 1586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 42 52 62 72 43 53 63 73 1587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 04 14 24 34 05 15 25 35 1588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 44 54 64 74 45 55 65 75 1589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 06 16 26 36 07 17 27 37 1590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 46 56 66 76 47 57 67 77 1591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 1592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 1593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
1594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 1595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 1596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 1597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 1598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 1599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 10 20 30 40 50 60 70 1600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 01 11 21 31 41 51 61 71 1601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 02 12 22 32 42 52 62 72 1602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 03 13 23 33 43 53 63 73 1603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 04 14 24 34 44 54 64 74 1604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 05 15 25 35 45 55 65 75 1605ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 06 16 26 36 46 56 66 76 1606ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 07 17 27 37 47 57 67 77 1607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0); 1608ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1); 1609ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2); 1610ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3); 1611ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4); 1612ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5); 1613ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6); 1614ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7); 
1615ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1616ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1617ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 01 02 03 04 05 06 07 1618ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 10 11 12 13 14 15 16 17 1619ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 20 21 22 23 24 25 26 27 1620ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 30 31 32 33 34 35 36 37 1621ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 40 41 42 43 44 45 46 47 1622ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 50 51 52 53 54 55 56 57 1623ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 60 61 62 63 64 65 66 67 1624ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 70 71 72 73 74 75 76 77 1625ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); 1626ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); 1627ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); 1628ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); 1629ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13); 1630ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); 1631ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); 1632ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); 1633ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 10 01 11 02 12 03 13 1634ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 20 30 21 31 22 32 23 33 1635ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 04 14 05 15 06 16 07 17 1636ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 24 34 25 35 26 36 27 37 1637ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 40 50 41 51 42 52 43 53 
1638ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 60 70 61 71 62 72 63 73 1639ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 44 54 45 55 46 56 47 57 1640ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 64 74 65 75 66 76 67 77 1641ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 1642ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 1643ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 1644ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 1645ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 1646ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 1647ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 1648ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 1649ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 10 20 30 01 11 21 31 1650ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 40 50 60 70 41 51 61 71 1651ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 02 12 22 32 03 13 23 33 1652ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 42 52 62 72 43 53 63 73 1653ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 04 14 24 34 05 15 25 35 1654ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 44 54 64 74 45 55 65 75 1655ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 06 16 26 36 07 17 27 37 1656ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 46 56 66 76 47 57 67 77 1657ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 1658ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 1659ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_2
= _mm_unpacklo_epi64(tr1_2, tr1_6); 1660ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 1661ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 1662ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 1663ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 1664ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 1665ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 00 10 20 30 40 50 60 70 1666ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 01 11 21 31 41 51 61 71 1667ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 02 12 22 32 42 52 62 72 1668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 03 13 23 33 43 53 63 73 1669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 04 14 24 34 44 54 64 74 1670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 05 15 25 35 45 55 65 75 1671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 06 16 26 36 46 56 66 76 1672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 07 17 27 37 47 57 67 77 1673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Store results 167491037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); 167591037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); 167691037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); 167791037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); 167891037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); 167991037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); 168091037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(out + 8 + 6 * 16), 
tr2_6); 168191037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); 1682ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out += 8*16; 1684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Setup in/out for next pass. 1686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = intermediate; 1687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out = output; 1688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 169091037db265ecdd914a26e056cf69207b4f50924ehkuang /* load_buffer_16x16: issues four load_buffer_8x8 calls, all with the same stride. in0 receives the tiles at input and input + 8 * stride (second call targets in0 + 8); in1 receives the tiles at the same two row offsets after input has been advanced by 8 elements (second call targets in1 + 8). NOTE(review): presumably each load_buffer_8x8 call fills 8 __m128i entries, so in0 and in1 need 16 entries each -- confirm against load_buffer_8x8. */ 169191037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void load_buffer_16x16(int16_t* input, __m128i *in0, 169291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i *in1, int stride) { 169391037db265ecdd914a26e056cf69207b4f50924ehkuang // load first 8 columns 169491037db265ecdd914a26e056cf69207b4f50924ehkuang load_buffer_8x8(input, in0, stride); 169591037db265ecdd914a26e056cf69207b4f50924ehkuang load_buffer_8x8(input + 8 * stride, in0 + 8, stride); 169691037db265ecdd914a26e056cf69207b4f50924ehkuang 169791037db265ecdd914a26e056cf69207b4f50924ehkuang input += 8; 169891037db265ecdd914a26e056cf69207b4f50924ehkuang // load second 8 columns 169991037db265ecdd914a26e056cf69207b4f50924ehkuang load_buffer_8x8(input, in1, stride); 170091037db265ecdd914a26e056cf69207b4f50924ehkuang load_buffer_8x8(input + 8 * stride, in1 + 8, stride); 170191037db265ecdd914a26e056cf69207b4f50924ehkuang} 170291037db265ecdd914a26e056cf69207b4f50924ehkuang /* write_buffer_16x16: mirror of load_buffer_16x16 for output. Issues four write_buffer_8x8 calls with the same stride: in0 and in0 + 8 go to output and output + 8 * stride, then output is advanced by 8 elements and in1 and in1 + 8 go to the same two row offsets. */ 170391037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void write_buffer_16x16(int16_t *output, __m128i *in0, 170491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i *in1, int stride) { 170591037db265ecdd914a26e056cf69207b4f50924ehkuang // write first 8 columns 170691037db265ecdd914a26e056cf69207b4f50924ehkuang write_buffer_8x8(output, in0, stride); 
170791037db265ecdd914a26e056cf69207b4f50924ehkuang write_buffer_8x8(output + 8 * stride, in0 + 8, stride); 170891037db265ecdd914a26e056cf69207b4f50924ehkuang // write second 8 columns 170991037db265ecdd914a26e056cf69207b4f50924ehkuang output += 8; 171091037db265ecdd914a26e056cf69207b4f50924ehkuang write_buffer_8x8(output, in1, stride); 171191037db265ecdd914a26e056cf69207b4f50924ehkuang write_buffer_8x8(output + 8 * stride, in1 + 8, stride); 171291037db265ecdd914a26e056cf69207b4f50924ehkuang} 171391037db265ecdd914a26e056cf69207b4f50924ehkuang /* array_transpose_16x16: runs array_transpose_8x8 once on each of the four 8-register groups res0[0..7], res0[8..15], res1[0..7] and res1[8..15]. The groups res0[0..7] and res1[8..15] are passed as both arguments of their call. The result for res1[0..7] is parked in tbuf so that res1[0..7] can then receive the result for res0[8..15]; tbuf is finally copied into res0[8..15], which exchanges the two middle groups. NOTE(review): relies on array_transpose_8x8 taking (source, destination) and tolerating source == destination -- confirm against its definition. */ 171491037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { 171591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tbuf[8]; 171691037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(res0, res0); 171791037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(res1, tbuf); 171891037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(res0 + 8, res1); 171991037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(res1 + 8, res1 + 8); 172091037db265ecdd914a26e056cf69207b4f50924ehkuang 172191037db265ecdd914a26e056cf69207b4f50924ehkuang res0[8] = tbuf[0]; 172291037db265ecdd914a26e056cf69207b4f50924ehkuang res0[9] = tbuf[1]; 172391037db265ecdd914a26e056cf69207b4f50924ehkuang res0[10] = tbuf[2]; 172491037db265ecdd914a26e056cf69207b4f50924ehkuang res0[11] = tbuf[3]; 172591037db265ecdd914a26e056cf69207b4f50924ehkuang res0[12] = tbuf[4]; 172691037db265ecdd914a26e056cf69207b4f50924ehkuang res0[13] = tbuf[5]; 172791037db265ecdd914a26e056cf69207b4f50924ehkuang res0[14] = tbuf[6]; 172891037db265ecdd914a26e056cf69207b4f50924ehkuang res0[15] = tbuf[7]; 172991037db265ecdd914a26e056cf69207b4f50924ehkuang} 173091037db265ecdd914a26e056cf69207b4f50924ehkuang /* right_shift_16x16: calls right_shift_8x8 with a second argument of 2 on each of the four 8-register groups res0, res0 + 8, res1 and res1 + 8. How right_shift_8x8 rounds is not visible from here. */ 173191037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { 173291037db265ecdd914a26e056cf69207b4f50924ehkuang // perform rounding 
operations 173391037db265ecdd914a26e056cf69207b4f50924ehkuang right_shift_8x8(res0, 2); 173491037db265ecdd914a26e056cf69207b4f50924ehkuang right_shift_8x8(res0 + 8, 2); 173591037db265ecdd914a26e056cf69207b4f50924ehkuang right_shift_8x8(res1, 2); 173691037db265ecdd914a26e056cf69207b4f50924ehkuang right_shift_8x8(res1 + 8, 2); 173791037db265ecdd914a26e056cf69207b4f50924ehkuang} 173891037db265ecdd914a26e056cf69207b4f50924ehkuang 173991037db265ecdd914a26e056cf69207b4f50924ehkuangvoid fdct16_1d_8col(__m128i *in) { 174091037db265ecdd914a26e056cf69207b4f50924ehkuang // perform 16x16 1-D DCT for 8 columns 174191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i i[8], s[8], p[8], t[8], u[16], v[16]; 174291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 174391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 174491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 174591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 174691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 174791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 174891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 174991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 175091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 175191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 
175291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); 175391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); 175491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 175591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 175691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); 175791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); 175891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 175991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 176091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 176191037db265ecdd914a26e056cf69207b4f50924ehkuang 176291037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 1 176391037db265ecdd914a26e056cf69207b4f50924ehkuang i[0] = _mm_add_epi16(in[0], in[15]); 176491037db265ecdd914a26e056cf69207b4f50924ehkuang i[1] = _mm_add_epi16(in[1], in[14]); 176591037db265ecdd914a26e056cf69207b4f50924ehkuang i[2] = _mm_add_epi16(in[2], in[13]); 176691037db265ecdd914a26e056cf69207b4f50924ehkuang i[3] = _mm_add_epi16(in[3], in[12]); 176791037db265ecdd914a26e056cf69207b4f50924ehkuang i[4] = _mm_add_epi16(in[4], in[11]); 176891037db265ecdd914a26e056cf69207b4f50924ehkuang i[5] = _mm_add_epi16(in[5], in[10]); 176991037db265ecdd914a26e056cf69207b4f50924ehkuang i[6] = _mm_add_epi16(in[6], in[9]); 177091037db265ecdd914a26e056cf69207b4f50924ehkuang i[7] = _mm_add_epi16(in[7], in[8]); 177191037db265ecdd914a26e056cf69207b4f50924ehkuang 
177291037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = _mm_sub_epi16(in[7], in[8]); 177391037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_sub_epi16(in[6], in[9]); 177491037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_sub_epi16(in[5], in[10]); 177591037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_sub_epi16(in[4], in[11]); 177691037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_sub_epi16(in[3], in[12]); 177791037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_sub_epi16(in[2], in[13]); 177891037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_sub_epi16(in[1], in[14]); 177991037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_sub_epi16(in[0], in[15]); 178091037db265ecdd914a26e056cf69207b4f50924ehkuang 178191037db265ecdd914a26e056cf69207b4f50924ehkuang p[0] = _mm_add_epi16(i[0], i[7]); 178291037db265ecdd914a26e056cf69207b4f50924ehkuang p[1] = _mm_add_epi16(i[1], i[6]); 178391037db265ecdd914a26e056cf69207b4f50924ehkuang p[2] = _mm_add_epi16(i[2], i[5]); 178491037db265ecdd914a26e056cf69207b4f50924ehkuang p[3] = _mm_add_epi16(i[3], i[4]); 178591037db265ecdd914a26e056cf69207b4f50924ehkuang p[4] = _mm_sub_epi16(i[3], i[4]); 178691037db265ecdd914a26e056cf69207b4f50924ehkuang p[5] = _mm_sub_epi16(i[2], i[5]); 178791037db265ecdd914a26e056cf69207b4f50924ehkuang p[6] = _mm_sub_epi16(i[1], i[6]); 178891037db265ecdd914a26e056cf69207b4f50924ehkuang p[7] = _mm_sub_epi16(i[0], i[7]); 178991037db265ecdd914a26e056cf69207b4f50924ehkuang 179091037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi16(p[0], p[3]); 179191037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi16(p[1], p[2]); 179291037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_sub_epi16(p[1], p[2]); 179391037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_sub_epi16(p[0], p[3]); 179491037db265ecdd914a26e056cf69207b4f50924ehkuang 179591037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_unpacklo_epi16(u[0], u[1]); 
179691037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_unpackhi_epi16(u[0], u[1]); 179791037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_unpacklo_epi16(u[2], u[3]); 179891037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_unpackhi_epi16(u[2], u[3]); 179991037db265ecdd914a26e056cf69207b4f50924ehkuang 180091037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); 180191037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); 180291037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); 180391037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); 180491037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); 180591037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); 180691037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); 180791037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); 180891037db265ecdd914a26e056cf69207b4f50924ehkuang 180991037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 181091037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 181191037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 181291037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 181391037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 181491037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 181591037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 181691037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 
181791037db265ecdd914a26e056cf69207b4f50924ehkuang 181891037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 181991037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 182091037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 182191037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 182291037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 182391037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 182491037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 182591037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 182691037db265ecdd914a26e056cf69207b4f50924ehkuang 182791037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_packs_epi32(u[0], u[1]); 182891037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_packs_epi32(u[4], u[5]); 182991037db265ecdd914a26e056cf69207b4f50924ehkuang in[8] = _mm_packs_epi32(u[2], u[3]); 183091037db265ecdd914a26e056cf69207b4f50924ehkuang in[12] = _mm_packs_epi32(u[6], u[7]); 183191037db265ecdd914a26e056cf69207b4f50924ehkuang 183291037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(p[5], p[6]); 183391037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(p[5], p[6]); 183491037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 183591037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 183691037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 183791037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 183891037db265ecdd914a26e056cf69207b4f50924ehkuang 183991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 
184091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 184191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 184291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 184391037db265ecdd914a26e056cf69207b4f50924ehkuang 184491037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 184591037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 184691037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 184791037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 184891037db265ecdd914a26e056cf69207b4f50924ehkuang 184991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_packs_epi32(v[0], v[1]); 185091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_packs_epi32(v[2], v[3]); 185191037db265ecdd914a26e056cf69207b4f50924ehkuang 185291037db265ecdd914a26e056cf69207b4f50924ehkuang t[0] = _mm_add_epi16(p[4], u[0]); 185391037db265ecdd914a26e056cf69207b4f50924ehkuang t[1] = _mm_sub_epi16(p[4], u[0]); 185491037db265ecdd914a26e056cf69207b4f50924ehkuang t[2] = _mm_sub_epi16(p[7], u[1]); 185591037db265ecdd914a26e056cf69207b4f50924ehkuang t[3] = _mm_add_epi16(p[7], u[1]); 185691037db265ecdd914a26e056cf69207b4f50924ehkuang 185791037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(t[0], t[3]); 185891037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(t[0], t[3]); 185991037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(t[1], t[2]); 186091037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(t[1], t[2]); 186191037db265ecdd914a26e056cf69207b4f50924ehkuang 186291037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); 186391037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], 
k__cospi_p28_p04); 186491037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); 186591037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); 186691037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); 186791037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); 186891037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); 186991037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); 187091037db265ecdd914a26e056cf69207b4f50924ehkuang 187191037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 187291037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 187391037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 187491037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 187591037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 187691037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 187791037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 187891037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 187991037db265ecdd914a26e056cf69207b4f50924ehkuang 188091037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 188191037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 188291037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 188391037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 188491037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_srai_epi32(u[4], 
DCT_CONST_BITS); 188591037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 188691037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 188791037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 188891037db265ecdd914a26e056cf69207b4f50924ehkuang 188991037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_packs_epi32(v[0], v[1]); 189091037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_packs_epi32(v[4], v[5]); 189191037db265ecdd914a26e056cf69207b4f50924ehkuang in[10] = _mm_packs_epi32(v[2], v[3]); 189291037db265ecdd914a26e056cf69207b4f50924ehkuang in[14] = _mm_packs_epi32(v[6], v[7]); 189391037db265ecdd914a26e056cf69207b4f50924ehkuang 189491037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 189591037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[2], s[5]); 189691037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[2], s[5]); 189791037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[3], s[4]); 189891037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[3], s[4]); 189991037db265ecdd914a26e056cf69207b4f50924ehkuang 190091037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 190191037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 190291037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 190391037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 190491037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 190591037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 190691037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 190791037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[1], 
k__cospi_p16_p16); 190891037db265ecdd914a26e056cf69207b4f50924ehkuang 190991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 191091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 191191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 191291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 191391037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 191491037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 191591037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 191691037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 191791037db265ecdd914a26e056cf69207b4f50924ehkuang 191891037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 191991037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 192091037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 192191037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 192291037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 192391037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 192491037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 192591037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 192691037db265ecdd914a26e056cf69207b4f50924ehkuang 192791037db265ecdd914a26e056cf69207b4f50924ehkuang t[2] = _mm_packs_epi32(v[0], v[1]); 192891037db265ecdd914a26e056cf69207b4f50924ehkuang t[3] = _mm_packs_epi32(v[2], v[3]); 192991037db265ecdd914a26e056cf69207b4f50924ehkuang t[4] = 
_mm_packs_epi32(v[4], v[5]); 193091037db265ecdd914a26e056cf69207b4f50924ehkuang t[5] = _mm_packs_epi32(v[6], v[7]); 193191037db265ecdd914a26e056cf69207b4f50924ehkuang 193291037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 3 193391037db265ecdd914a26e056cf69207b4f50924ehkuang p[0] = _mm_add_epi16(s[0], t[3]); 193491037db265ecdd914a26e056cf69207b4f50924ehkuang p[1] = _mm_add_epi16(s[1], t[2]); 193591037db265ecdd914a26e056cf69207b4f50924ehkuang p[2] = _mm_sub_epi16(s[1], t[2]); 193691037db265ecdd914a26e056cf69207b4f50924ehkuang p[3] = _mm_sub_epi16(s[0], t[3]); 193791037db265ecdd914a26e056cf69207b4f50924ehkuang p[4] = _mm_sub_epi16(s[7], t[4]); 193891037db265ecdd914a26e056cf69207b4f50924ehkuang p[5] = _mm_sub_epi16(s[6], t[5]); 193991037db265ecdd914a26e056cf69207b4f50924ehkuang p[6] = _mm_add_epi16(s[6], t[5]); 194091037db265ecdd914a26e056cf69207b4f50924ehkuang p[7] = _mm_add_epi16(s[7], t[4]); 194191037db265ecdd914a26e056cf69207b4f50924ehkuang 194291037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 4 194391037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(p[1], p[6]); 194491037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(p[1], p[6]); 194591037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(p[2], p[5]); 194691037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(p[2], p[5]); 194791037db265ecdd914a26e056cf69207b4f50924ehkuang 194891037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); 194991037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); 195091037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); 195191037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); 195291037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); 195391037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = 
_mm_madd_epi16(u[3], k__cospi_m08_p24); 195491037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); 195591037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); 195691037db265ecdd914a26e056cf69207b4f50924ehkuang 195791037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 195891037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 195991037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 196091037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 196191037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 196291037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 196391037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 196491037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 196591037db265ecdd914a26e056cf69207b4f50924ehkuang 196691037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 196791037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 196891037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 196991037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 197091037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 197191037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 197291037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 197391037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 197491037db265ecdd914a26e056cf69207b4f50924ehkuang 
197591037db265ecdd914a26e056cf69207b4f50924ehkuang t[1] = _mm_packs_epi32(v[0], v[1]); 197691037db265ecdd914a26e056cf69207b4f50924ehkuang t[2] = _mm_packs_epi32(v[2], v[3]); 197791037db265ecdd914a26e056cf69207b4f50924ehkuang t[5] = _mm_packs_epi32(v[4], v[5]); 197891037db265ecdd914a26e056cf69207b4f50924ehkuang t[6] = _mm_packs_epi32(v[6], v[7]); 197991037db265ecdd914a26e056cf69207b4f50924ehkuang 198091037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 5 198191037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = _mm_add_epi16(p[0], t[1]); 198291037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_sub_epi16(p[0], t[1]); 198391037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_sub_epi16(p[3], t[2]); 198491037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_add_epi16(p[3], t[2]); 198591037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_add_epi16(p[4], t[5]); 198691037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_sub_epi16(p[4], t[5]); 198791037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_sub_epi16(p[7], t[6]); 198891037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_add_epi16(p[7], t[6]); 198991037db265ecdd914a26e056cf69207b4f50924ehkuang 199091037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 6 199191037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[0], s[7]); 199291037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[0], s[7]); 199391037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[1], s[6]); 199491037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[1], s[6]); 199591037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(s[2], s[5]); 199691037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(s[2], s[5]); 199791037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(s[3], s[4]); 199891037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(s[3], s[4]); 
199991037db265ecdd914a26e056cf69207b4f50924ehkuang 200091037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); 200191037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); 200291037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); 200391037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); 200491037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); 200591037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); 200691037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); 200791037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); 200891037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); 200991037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); 201091037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); 201191037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); 201291037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); 201391037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); 201491037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); 201591037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); 201691037db265ecdd914a26e056cf69207b4f50924ehkuang 201791037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 201891037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 201991037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 
202091037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 202191037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 202291037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 202391037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 202491037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 202591037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 202691037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 202791037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 202891037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 202991037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 203091037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 203191037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 203291037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 203391037db265ecdd914a26e056cf69207b4f50924ehkuang 203491037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 203591037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 203691037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 203791037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 203891037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 203991037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 
204091037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 204191037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 204291037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 204391037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 204491037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 204591037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 204691037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 204791037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 204891037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 204991037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 205091037db265ecdd914a26e056cf69207b4f50924ehkuang 205191037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_packs_epi32(v[0], v[1]); 205291037db265ecdd914a26e056cf69207b4f50924ehkuang in[9] = _mm_packs_epi32(v[2], v[3]); 205391037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_packs_epi32(v[4], v[5]); 205491037db265ecdd914a26e056cf69207b4f50924ehkuang in[13] = _mm_packs_epi32(v[6], v[7]); 205591037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_packs_epi32(v[8], v[9]); 205691037db265ecdd914a26e056cf69207b4f50924ehkuang in[11] = _mm_packs_epi32(v[10], v[11]); 205791037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_packs_epi32(v[12], v[13]); 205891037db265ecdd914a26e056cf69207b4f50924ehkuang in[15] = _mm_packs_epi32(v[14], v[15]); 205991037db265ecdd914a26e056cf69207b4f50924ehkuang} 206091037db265ecdd914a26e056cf69207b4f50924ehkuang 206191037db265ecdd914a26e056cf69207b4f50924ehkuangvoid fadst16_1d_8col(__m128i *in) { 206291037db265ecdd914a26e056cf69207b4f50924ehkuang // perform 16x16 1-D 
ADST for 8 columns 206391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s[16], x[16], u[32], v[32]; 206491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 206591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 206691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 206791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 206891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); 206991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); 207091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); 207191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); 207291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); 207391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); 207491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); 207591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); 207691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); 207791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); 207891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); 
207991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); 208091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 208191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 208291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 208391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 208491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); 208591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); 208691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 208791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 208891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 208991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); 209091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 209191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 209291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 209391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 209491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i kZero = _mm_set1_epi16(0); 
209591037db265ecdd914a26e056cf69207b4f50924ehkuang 209691037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(in[15], in[0]); 209791037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(in[15], in[0]); 209891037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(in[13], in[2]); 209991037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(in[13], in[2]); 210091037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(in[11], in[4]); 210191037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(in[11], in[4]); 210291037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(in[9], in[6]); 210391037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(in[9], in[6]); 210491037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_unpacklo_epi16(in[7], in[8]); 210591037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_unpackhi_epi16(in[7], in[8]); 210691037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_unpacklo_epi16(in[5], in[10]); 210791037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_unpackhi_epi16(in[5], in[10]); 210891037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_unpacklo_epi16(in[3], in[12]); 210991037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_unpackhi_epi16(in[3], in[12]); 211091037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_unpacklo_epi16(in[1], in[14]); 211191037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_unpackhi_epi16(in[1], in[14]); 211291037db265ecdd914a26e056cf69207b4f50924ehkuang 211391037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); 211491037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); 211591037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); 211691037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); 
211791037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); 211891037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); 211991037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); 212091037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); 212191037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); 212291037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); 212391037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); 212491037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); 212591037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); 212691037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); 212791037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); 212891037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); 212991037db265ecdd914a26e056cf69207b4f50924ehkuang v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); 213091037db265ecdd914a26e056cf69207b4f50924ehkuang v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); 213191037db265ecdd914a26e056cf69207b4f50924ehkuang v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); 213291037db265ecdd914a26e056cf69207b4f50924ehkuang v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); 213391037db265ecdd914a26e056cf69207b4f50924ehkuang v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); 213491037db265ecdd914a26e056cf69207b4f50924ehkuang v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); 213591037db265ecdd914a26e056cf69207b4f50924ehkuang v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); 213691037db265ecdd914a26e056cf69207b4f50924ehkuang v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); 
213791037db265ecdd914a26e056cf69207b4f50924ehkuang v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); 213891037db265ecdd914a26e056cf69207b4f50924ehkuang v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); 213991037db265ecdd914a26e056cf69207b4f50924ehkuang v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); 214091037db265ecdd914a26e056cf69207b4f50924ehkuang v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); 214191037db265ecdd914a26e056cf69207b4f50924ehkuang v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); 214291037db265ecdd914a26e056cf69207b4f50924ehkuang v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); 214391037db265ecdd914a26e056cf69207b4f50924ehkuang v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); 214491037db265ecdd914a26e056cf69207b4f50924ehkuang v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); 214591037db265ecdd914a26e056cf69207b4f50924ehkuang 214691037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[16]); 214791037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], v[17]); 214891037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], v[18]); 214991037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], v[19]); 215091037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], v[20]); 215191037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], v[21]); 215291037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], v[22]); 215391037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], v[23]); 215491037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], v[24]); 215591037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], v[25]); 215691037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], v[26]); 215791037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], v[27]); 215891037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(v[12], v[28]); 
215991037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(v[13], v[29]); 216091037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(v[14], v[30]); 216191037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(v[15], v[31]); 216291037db265ecdd914a26e056cf69207b4f50924ehkuang u[16] = _mm_sub_epi32(v[0], v[16]); 216391037db265ecdd914a26e056cf69207b4f50924ehkuang u[17] = _mm_sub_epi32(v[1], v[17]); 216491037db265ecdd914a26e056cf69207b4f50924ehkuang u[18] = _mm_sub_epi32(v[2], v[18]); 216591037db265ecdd914a26e056cf69207b4f50924ehkuang u[19] = _mm_sub_epi32(v[3], v[19]); 216691037db265ecdd914a26e056cf69207b4f50924ehkuang u[20] = _mm_sub_epi32(v[4], v[20]); 216791037db265ecdd914a26e056cf69207b4f50924ehkuang u[21] = _mm_sub_epi32(v[5], v[21]); 216891037db265ecdd914a26e056cf69207b4f50924ehkuang u[22] = _mm_sub_epi32(v[6], v[22]); 216991037db265ecdd914a26e056cf69207b4f50924ehkuang u[23] = _mm_sub_epi32(v[7], v[23]); 217091037db265ecdd914a26e056cf69207b4f50924ehkuang u[24] = _mm_sub_epi32(v[8], v[24]); 217191037db265ecdd914a26e056cf69207b4f50924ehkuang u[25] = _mm_sub_epi32(v[9], v[25]); 217291037db265ecdd914a26e056cf69207b4f50924ehkuang u[26] = _mm_sub_epi32(v[10], v[26]); 217391037db265ecdd914a26e056cf69207b4f50924ehkuang u[27] = _mm_sub_epi32(v[11], v[27]); 217491037db265ecdd914a26e056cf69207b4f50924ehkuang u[28] = _mm_sub_epi32(v[12], v[28]); 217591037db265ecdd914a26e056cf69207b4f50924ehkuang u[29] = _mm_sub_epi32(v[13], v[29]); 217691037db265ecdd914a26e056cf69207b4f50924ehkuang u[30] = _mm_sub_epi32(v[14], v[30]); 217791037db265ecdd914a26e056cf69207b4f50924ehkuang u[31] = _mm_sub_epi32(v[15], v[31]); 217891037db265ecdd914a26e056cf69207b4f50924ehkuang 217991037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 218091037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 218191037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_add_epi32(u[2], 
k__DCT_CONST_ROUNDING); 218291037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 218391037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 218491037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 218591037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 218691037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 218791037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 218891037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 218991037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 219091037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 219191037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 219291037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 219391037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 219491037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 219591037db265ecdd914a26e056cf69207b4f50924ehkuang v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 219691037db265ecdd914a26e056cf69207b4f50924ehkuang v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 219791037db265ecdd914a26e056cf69207b4f50924ehkuang v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); 219891037db265ecdd914a26e056cf69207b4f50924ehkuang v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 219991037db265ecdd914a26e056cf69207b4f50924ehkuang v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 220091037db265ecdd914a26e056cf69207b4f50924ehkuang v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 
220191037db265ecdd914a26e056cf69207b4f50924ehkuang v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 220291037db265ecdd914a26e056cf69207b4f50924ehkuang v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 220391037db265ecdd914a26e056cf69207b4f50924ehkuang v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 220491037db265ecdd914a26e056cf69207b4f50924ehkuang v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 220591037db265ecdd914a26e056cf69207b4f50924ehkuang v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 220691037db265ecdd914a26e056cf69207b4f50924ehkuang v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 220791037db265ecdd914a26e056cf69207b4f50924ehkuang v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 220891037db265ecdd914a26e056cf69207b4f50924ehkuang v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 220991037db265ecdd914a26e056cf69207b4f50924ehkuang v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 221091037db265ecdd914a26e056cf69207b4f50924ehkuang v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); 221191037db265ecdd914a26e056cf69207b4f50924ehkuang 221291037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 221391037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 221491037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 221591037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 221691037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 221791037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 221891037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 221991037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 222091037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 
222191037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 222291037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 222391037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 222491037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 222591037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 222691037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 222791037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 222891037db265ecdd914a26e056cf69207b4f50924ehkuang u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); 222991037db265ecdd914a26e056cf69207b4f50924ehkuang u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); 223091037db265ecdd914a26e056cf69207b4f50924ehkuang u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); 223191037db265ecdd914a26e056cf69207b4f50924ehkuang u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); 223291037db265ecdd914a26e056cf69207b4f50924ehkuang u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); 223391037db265ecdd914a26e056cf69207b4f50924ehkuang u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); 223491037db265ecdd914a26e056cf69207b4f50924ehkuang u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); 223591037db265ecdd914a26e056cf69207b4f50924ehkuang u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); 223691037db265ecdd914a26e056cf69207b4f50924ehkuang u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); 223791037db265ecdd914a26e056cf69207b4f50924ehkuang u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); 223891037db265ecdd914a26e056cf69207b4f50924ehkuang u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); 223991037db265ecdd914a26e056cf69207b4f50924ehkuang u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); 224091037db265ecdd914a26e056cf69207b4f50924ehkuang u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); 
224191037db265ecdd914a26e056cf69207b4f50924ehkuang u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); 224291037db265ecdd914a26e056cf69207b4f50924ehkuang u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); 224391037db265ecdd914a26e056cf69207b4f50924ehkuang u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); 224491037db265ecdd914a26e056cf69207b4f50924ehkuang 224591037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = _mm_packs_epi32(u[0], u[1]); 224691037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_packs_epi32(u[2], u[3]); 224791037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_packs_epi32(u[4], u[5]); 224891037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_packs_epi32(u[6], u[7]); 224991037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_packs_epi32(u[8], u[9]); 225091037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_packs_epi32(u[10], u[11]); 225191037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_packs_epi32(u[12], u[13]); 225291037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_packs_epi32(u[14], u[15]); 225391037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = _mm_packs_epi32(u[16], u[17]); 225491037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = _mm_packs_epi32(u[18], u[19]); 225591037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_packs_epi32(u[20], u[21]); 225691037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = _mm_packs_epi32(u[22], u[23]); 225791037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = _mm_packs_epi32(u[24], u[25]); 225891037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(u[26], u[27]); 225991037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = _mm_packs_epi32(u[28], u[29]); 226091037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = _mm_packs_epi32(u[30], u[31]); 226191037db265ecdd914a26e056cf69207b4f50924ehkuang 226291037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 226391037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[8], s[9]); 
226491037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[8], s[9]); 226591037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[10], s[11]); 226691037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[10], s[11]); 226791037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(s[12], s[13]); 226891037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(s[12], s[13]); 226991037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(s[14], s[15]); 227091037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(s[14], s[15]); 227191037db265ecdd914a26e056cf69207b4f50924ehkuang 227291037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 227391037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 227491037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 227591037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 227691037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 227791037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 227891037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 227991037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 228091037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); 228191037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); 228291037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); 228391037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); 228491037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); 
228591037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); 228691037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); 228791037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); 228891037db265ecdd914a26e056cf69207b4f50924ehkuang 228991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[8]); 229091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], v[9]); 229191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], v[10]); 229291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], v[11]); 229391037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], v[12]); 229491037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], v[13]); 229591037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], v[14]); 229691037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], v[15]); 229791037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_sub_epi32(v[0], v[8]); 229891037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_sub_epi32(v[1], v[9]); 229991037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_sub_epi32(v[2], v[10]); 230091037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_sub_epi32(v[3], v[11]); 230191037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_sub_epi32(v[4], v[12]); 230291037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_sub_epi32(v[5], v[13]); 230391037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_sub_epi32(v[6], v[14]); 230491037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_sub_epi32(v[7], v[15]); 230591037db265ecdd914a26e056cf69207b4f50924ehkuang 230691037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 230791037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 
230891037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 230991037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 231091037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 231191037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 231291037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 231391037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 231491037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 231591037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 231691037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 231791037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 231891037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 231991037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 232091037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 232191037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 232291037db265ecdd914a26e056cf69207b4f50924ehkuang 232391037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 232491037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 232591037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 232691037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 232791037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 
232891037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 232991037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 233091037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 233191037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 233291037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 233391037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 233491037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 233591037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 233691037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 233791037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 233891037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 233991037db265ecdd914a26e056cf69207b4f50924ehkuang 234091037db265ecdd914a26e056cf69207b4f50924ehkuang x[0] = _mm_add_epi16(s[0], s[4]); 234191037db265ecdd914a26e056cf69207b4f50924ehkuang x[1] = _mm_add_epi16(s[1], s[5]); 234291037db265ecdd914a26e056cf69207b4f50924ehkuang x[2] = _mm_add_epi16(s[2], s[6]); 234391037db265ecdd914a26e056cf69207b4f50924ehkuang x[3] = _mm_add_epi16(s[3], s[7]); 234491037db265ecdd914a26e056cf69207b4f50924ehkuang x[4] = _mm_sub_epi16(s[0], s[4]); 234591037db265ecdd914a26e056cf69207b4f50924ehkuang x[5] = _mm_sub_epi16(s[1], s[5]); 234691037db265ecdd914a26e056cf69207b4f50924ehkuang x[6] = _mm_sub_epi16(s[2], s[6]); 234791037db265ecdd914a26e056cf69207b4f50924ehkuang x[7] = _mm_sub_epi16(s[3], s[7]); 234891037db265ecdd914a26e056cf69207b4f50924ehkuang x[8] = _mm_packs_epi32(u[0], u[1]); 234991037db265ecdd914a26e056cf69207b4f50924ehkuang x[9] = _mm_packs_epi32(u[2], u[3]); 
235091037db265ecdd914a26e056cf69207b4f50924ehkuang x[10] = _mm_packs_epi32(u[4], u[5]); 235191037db265ecdd914a26e056cf69207b4f50924ehkuang x[11] = _mm_packs_epi32(u[6], u[7]); 235291037db265ecdd914a26e056cf69207b4f50924ehkuang x[12] = _mm_packs_epi32(u[8], u[9]); 235391037db265ecdd914a26e056cf69207b4f50924ehkuang x[13] = _mm_packs_epi32(u[10], u[11]); 235491037db265ecdd914a26e056cf69207b4f50924ehkuang x[14] = _mm_packs_epi32(u[12], u[13]); 235591037db265ecdd914a26e056cf69207b4f50924ehkuang x[15] = _mm_packs_epi32(u[14], u[15]); 235691037db265ecdd914a26e056cf69207b4f50924ehkuang 235791037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 3 235891037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(x[4], x[5]); 235991037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(x[4], x[5]); 236091037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(x[6], x[7]); 236191037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(x[6], x[7]); 236291037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(x[12], x[13]); 236391037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(x[12], x[13]); 236491037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(x[14], x[15]); 236591037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(x[14], x[15]); 236691037db265ecdd914a26e056cf69207b4f50924ehkuang 236791037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); 236891037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 236991037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); 237091037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 237191037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); 237291037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], 
k__cospi_m24_p08); 237391037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 237491037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 237591037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); 237691037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); 237791037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); 237891037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); 237991037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); 238091037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); 238191037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); 238291037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); 238391037db265ecdd914a26e056cf69207b4f50924ehkuang 238491037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[4]); 238591037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], v[5]); 238691037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], v[6]); 238791037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], v[7]); 238891037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_sub_epi32(v[0], v[4]); 238991037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_sub_epi32(v[1], v[5]); 239091037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_sub_epi32(v[2], v[6]); 239191037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_sub_epi32(v[3], v[7]); 239291037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], v[12]); 239391037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], v[13]); 239491037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], v[14]); 
239591037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], v[15]); 239691037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_sub_epi32(v[8], v[12]); 239791037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_sub_epi32(v[9], v[13]); 239891037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_sub_epi32(v[10], v[14]); 239991037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_sub_epi32(v[11], v[15]); 240091037db265ecdd914a26e056cf69207b4f50924ehkuang 240191037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 240291037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 240391037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 240491037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 240591037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 240691037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 240791037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 240891037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 240991037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 241091037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 241191037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 241291037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 241391037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 241491037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 241591037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = 
_mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 241691037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 241791037db265ecdd914a26e056cf69207b4f50924ehkuang 241891037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 241991037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 242091037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 242191037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 242291037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 242391037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 242491037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 242591037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 242691037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 242791037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 242891037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 242991037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 243091037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 243191037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 243291037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 243391037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 243491037db265ecdd914a26e056cf69207b4f50924ehkuang 243591037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = _mm_add_epi16(x[0], x[2]); 243691037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_add_epi16(x[1], x[3]); 
243791037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_sub_epi16(x[0], x[2]); 243891037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_sub_epi16(x[1], x[3]); 243991037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_packs_epi32(v[0], v[1]); 244091037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_packs_epi32(v[2], v[3]); 244191037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_packs_epi32(v[4], v[5]); 244291037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_packs_epi32(v[6], v[7]); 244391037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = _mm_add_epi16(x[8], x[10]); 244491037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = _mm_add_epi16(x[9], x[11]); 244591037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_sub_epi16(x[8], x[10]); 244691037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = _mm_sub_epi16(x[9], x[11]); 244791037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = _mm_packs_epi32(v[8], v[9]); 244891037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(v[10], v[11]); 244991037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = _mm_packs_epi32(v[12], v[13]); 245091037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = _mm_packs_epi32(v[14], v[15]); 245191037db265ecdd914a26e056cf69207b4f50924ehkuang 245291037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 4 245391037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[2], s[3]); 245491037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[2], s[3]); 245591037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[6], s[7]); 245691037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[6], s[7]); 245791037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(s[10], s[11]); 245891037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(s[10], s[11]); 245991037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(s[14], s[15]); 
246091037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(s[14], s[15]); 246191037db265ecdd914a26e056cf69207b4f50924ehkuang 246291037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); 246391037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); 246491037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 246591037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 246691037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 246791037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 246891037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 246991037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 247091037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); 247191037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); 247291037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); 247391037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); 247491037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); 247591037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); 247691037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); 247791037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); 247891037db265ecdd914a26e056cf69207b4f50924ehkuang 247991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 248091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 
248191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 248291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 248391037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 248491037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 248591037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 248691037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 248791037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 248891037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 248991037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 249091037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 249191037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 249291037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 249391037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 249491037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 249591037db265ecdd914a26e056cf69207b4f50924ehkuang 249691037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 249791037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 249891037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 249991037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 250091037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 
250191037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 250291037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 250391037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 250491037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 250591037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 250691037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 250791037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 250891037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 250991037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 251091037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 251191037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 251291037db265ecdd914a26e056cf69207b4f50924ehkuang 251391037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = s[0]; 251491037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_sub_epi16(kZero, s[8]); 251591037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = s[12]; 251691037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_sub_epi16(kZero, s[4]); 251791037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_packs_epi32(v[4], v[5]); 251891037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_packs_epi32(v[12], v[13]); 251991037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_packs_epi32(v[8], v[9]); 252091037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_packs_epi32(v[0], v[1]); 252191037db265ecdd914a26e056cf69207b4f50924ehkuang in[8] = _mm_packs_epi32(v[2], v[3]); 252291037db265ecdd914a26e056cf69207b4f50924ehkuang in[9] = _mm_packs_epi32(v[10], v[11]); 
252391037db265ecdd914a26e056cf69207b4f50924ehkuang in[10] = _mm_packs_epi32(v[14], v[15]); 252491037db265ecdd914a26e056cf69207b4f50924ehkuang in[11] = _mm_packs_epi32(v[6], v[7]); 252591037db265ecdd914a26e056cf69207b4f50924ehkuang in[12] = s[5]; 252691037db265ecdd914a26e056cf69207b4f50924ehkuang in[13] = _mm_sub_epi16(kZero, s[13]); 252791037db265ecdd914a26e056cf69207b4f50924ehkuang in[14] = s[9]; 252891037db265ecdd914a26e056cf69207b4f50924ehkuang in[15] = _mm_sub_epi16(kZero, s[1]); 252991037db265ecdd914a26e056cf69207b4f50924ehkuang} 253091037db265ecdd914a26e056cf69207b4f50924ehkuang 253191037db265ecdd914a26e056cf69207b4f50924ehkuangvoid fdct16_1d_sse2(__m128i *in0, __m128i *in1) { 253291037db265ecdd914a26e056cf69207b4f50924ehkuang fdct16_1d_8col(in0); 253391037db265ecdd914a26e056cf69207b4f50924ehkuang fdct16_1d_8col(in1); 253491037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_16x16(in0, in1); 253591037db265ecdd914a26e056cf69207b4f50924ehkuang} 253691037db265ecdd914a26e056cf69207b4f50924ehkuang 253791037db265ecdd914a26e056cf69207b4f50924ehkuangvoid fadst16_1d_sse2(__m128i *in0, __m128i *in1) { 253891037db265ecdd914a26e056cf69207b4f50924ehkuang fadst16_1d_8col(in0); 253991037db265ecdd914a26e056cf69207b4f50924ehkuang fadst16_1d_8col(in1); 254091037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_16x16(in0, in1); 254191037db265ecdd914a26e056cf69207b4f50924ehkuang} 254291037db265ecdd914a26e056cf69207b4f50924ehkuang 254391037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_short_fht16x16_sse2(int16_t *input, int16_t *output, 254491037db265ecdd914a26e056cf69207b4f50924ehkuang int stride, int tx_type) { 254591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in0[16], in1[16]; 254691037db265ecdd914a26e056cf69207b4f50924ehkuang load_buffer_16x16(input, in0, in1, stride); 254791037db265ecdd914a26e056cf69207b4f50924ehkuang switch (tx_type) { 254891037db265ecdd914a26e056cf69207b4f50924ehkuang case 0: // DCT_DCT 
254991037db265ecdd914a26e056cf69207b4f50924ehkuang fdct16_1d_sse2(in0, in1); 255091037db265ecdd914a26e056cf69207b4f50924ehkuang right_shift_16x16(in0, in1); 255191037db265ecdd914a26e056cf69207b4f50924ehkuang fdct16_1d_sse2(in0, in1); 255291037db265ecdd914a26e056cf69207b4f50924ehkuang break; 255391037db265ecdd914a26e056cf69207b4f50924ehkuang case 1: // ADST_DCT 255491037db265ecdd914a26e056cf69207b4f50924ehkuang fadst16_1d_sse2(in0, in1); 255591037db265ecdd914a26e056cf69207b4f50924ehkuang right_shift_16x16(in0, in1); 255691037db265ecdd914a26e056cf69207b4f50924ehkuang fdct16_1d_sse2(in0, in1); 255791037db265ecdd914a26e056cf69207b4f50924ehkuang break; 255891037db265ecdd914a26e056cf69207b4f50924ehkuang case 2: // DCT_ADST 255991037db265ecdd914a26e056cf69207b4f50924ehkuang fdct16_1d_sse2(in0, in1); 256091037db265ecdd914a26e056cf69207b4f50924ehkuang right_shift_16x16(in0, in1); 256191037db265ecdd914a26e056cf69207b4f50924ehkuang fadst16_1d_sse2(in0, in1); 256291037db265ecdd914a26e056cf69207b4f50924ehkuang break; 256391037db265ecdd914a26e056cf69207b4f50924ehkuang case 3: // ADST_ADST 256491037db265ecdd914a26e056cf69207b4f50924ehkuang fadst16_1d_sse2(in0, in1); 256591037db265ecdd914a26e056cf69207b4f50924ehkuang right_shift_16x16(in0, in1); 256691037db265ecdd914a26e056cf69207b4f50924ehkuang fadst16_1d_sse2(in0, in1); 256791037db265ecdd914a26e056cf69207b4f50924ehkuang break; 256891037db265ecdd914a26e056cf69207b4f50924ehkuang default: 256991037db265ecdd914a26e056cf69207b4f50924ehkuang assert(0); 257091037db265ecdd914a26e056cf69207b4f50924ehkuang break; 257191037db265ecdd914a26e056cf69207b4f50924ehkuang } 257291037db265ecdd914a26e056cf69207b4f50924ehkuang write_buffer_16x16(output, in0, in1, 16); 257391037db265ecdd914a26e056cf69207b4f50924ehkuang} 257491037db265ecdd914a26e056cf69207b4f50924ehkuang 257591037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_short_fdct32x32_rd_sse2(int16_t *input, 257691037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *output_org, int pitch) { 
257791037db265ecdd914a26e056cf69207b4f50924ehkuang // Calculate pre-multiplied strides 257891037db265ecdd914a26e056cf69207b4f50924ehkuang const int str1 = pitch >> 1; 257991037db265ecdd914a26e056cf69207b4f50924ehkuang const int str2 = pitch; 258091037db265ecdd914a26e056cf69207b4f50924ehkuang const int str3 = pitch + str1; 258191037db265ecdd914a26e056cf69207b4f50924ehkuang // We need an intermediate buffer between passes. 258291037db265ecdd914a26e056cf69207b4f50924ehkuang DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); 258391037db265ecdd914a26e056cf69207b4f50924ehkuang // Constants 258491037db265ecdd914a26e056cf69207b4f50924ehkuang // When we use them, in one case, they are all the same. In all others 258591037db265ecdd914a26e056cf69207b4f50924ehkuang // it's a pair of them that we need to repeat four times. This is done 258691037db265ecdd914a26e056cf69207b4f50924ehkuang // by constructing the 32 bit constant corresponding to that pair. 258791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); 258891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); 258991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 259091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 259191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); 259291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); 259391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 259491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 
259591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); 259691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 259791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 259891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); 259991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); 260091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); 260191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); 260291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 260391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 260491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 260591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 260691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); 260791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); 260891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); 260991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); 261091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m25_p07 = 
pair_set_epi16(-cospi_25_64, cospi_7_64); 261191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); 261291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); 261391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); 261491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); 261591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); 261691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); 261791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); 261891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); 261991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); 262091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); 262191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); 262291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 262391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i kZero = _mm_set1_epi16(0); 262491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i kOne = _mm_set1_epi16(1); 262591037db265ecdd914a26e056cf69207b4f50924ehkuang // Do the two transform/transpose passes 262691037db265ecdd914a26e056cf69207b4f50924ehkuang int pass; 262791037db265ecdd914a26e056cf69207b4f50924ehkuang for (pass = 0; pass < 2; ++pass) { 
262891037db265ecdd914a26e056cf69207b4f50924ehkuang // We process eight columns (transposed rows in second pass) at a time. 262991037db265ecdd914a26e056cf69207b4f50924ehkuang int column_start; 263091037db265ecdd914a26e056cf69207b4f50924ehkuang for (column_start = 0; column_start < 32; column_start += 8) { 263191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i step1[32]; 263291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i step2[32]; 263391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i step3[32]; 263491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i out[32]; 263591037db265ecdd914a26e056cf69207b4f50924ehkuang // Stage 1 263691037db265ecdd914a26e056cf69207b4f50924ehkuang // Note: even though all the loads below are aligned, using the aligned 263791037db265ecdd914a26e056cf69207b4f50924ehkuang // intrinsic make the code slightly slower. 263891037db265ecdd914a26e056cf69207b4f50924ehkuang if (0 == pass) { 263991037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *in = &input[column_start]; 264091037db265ecdd914a26e056cf69207b4f50924ehkuang // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; 264191037db265ecdd914a26e056cf69207b4f50924ehkuang // Note: the next four blocks could be in a loop. That would help the 264291037db265ecdd914a26e056cf69207b4f50924ehkuang // instruction cache but is actually slower. 
264391037db265ecdd914a26e056cf69207b4f50924ehkuang { 264491037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *ina = in + 0 * str1; 264591037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *inb = in + 31 * str1; 264691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i *step1a = &step1[ 0]; 264791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i *step1b = &step1[31]; 264891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 264991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 265091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 265191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 265291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 265391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 265491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 265591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 265691037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 0] = _mm_add_epi16(ina0, inb0); 265791037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 1] = _mm_add_epi16(ina1, inb1); 265891037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 2] = _mm_add_epi16(ina2, inb2); 265991037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 3] = _mm_add_epi16(ina3, inb3); 266091037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-3] = _mm_sub_epi16(ina3, inb3); 266191037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-2] = _mm_sub_epi16(ina2, inb2); 266291037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-1] = _mm_sub_epi16(ina1, inb1); 
266391037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-0] = _mm_sub_epi16(ina0, inb0); 266491037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); 266591037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); 266691037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); 266791037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); 266891037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-3] = _mm_slli_epi16(step1b[-3], 2); 266991037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-2] = _mm_slli_epi16(step1b[-2], 2); 267091037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-1] = _mm_slli_epi16(step1b[-1], 2); 267191037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-0] = _mm_slli_epi16(step1b[-0], 2); 267291037db265ecdd914a26e056cf69207b4f50924ehkuang } 267391037db265ecdd914a26e056cf69207b4f50924ehkuang { 267491037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *ina = in + 4 * str1; 267591037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *inb = in + 27 * str1; 267691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i *step1a = &step1[ 4]; 267791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i *step1b = &step1[27]; 267891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 267991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 268091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 268191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 268291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 268391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 
268491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 268591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 268691037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 0] = _mm_add_epi16(ina0, inb0); 268791037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 1] = _mm_add_epi16(ina1, inb1); 268891037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 2] = _mm_add_epi16(ina2, inb2); 268991037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 3] = _mm_add_epi16(ina3, inb3); 269091037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-3] = _mm_sub_epi16(ina3, inb3); 269191037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-2] = _mm_sub_epi16(ina2, inb2); 269291037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-1] = _mm_sub_epi16(ina1, inb1); 269391037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-0] = _mm_sub_epi16(ina0, inb0); 269491037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); 269591037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); 269691037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); 269791037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); 269891037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-3] = _mm_slli_epi16(step1b[-3], 2); 269991037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-2] = _mm_slli_epi16(step1b[-2], 2); 270091037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-1] = _mm_slli_epi16(step1b[-1], 2); 270191037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-0] = _mm_slli_epi16(step1b[-0], 2); 270291037db265ecdd914a26e056cf69207b4f50924ehkuang } 270391037db265ecdd914a26e056cf69207b4f50924ehkuang { 270491037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *ina = in + 8 * str1; 270591037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *inb = in + 23 * str1; 
270691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i *step1a = &step1[ 8]; 270791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i *step1b = &step1[23]; 270891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 270991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 271091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 271191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 271291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 271391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 271491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 271591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 271691037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 0] = _mm_add_epi16(ina0, inb0); 271791037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 1] = _mm_add_epi16(ina1, inb1); 271891037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 2] = _mm_add_epi16(ina2, inb2); 271991037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 3] = _mm_add_epi16(ina3, inb3); 272091037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-3] = _mm_sub_epi16(ina3, inb3); 272191037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-2] = _mm_sub_epi16(ina2, inb2); 272291037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-1] = _mm_sub_epi16(ina1, inb1); 272391037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-0] = _mm_sub_epi16(ina0, inb0); 272491037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); 272591037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 1] = 
_mm_slli_epi16(step1a[ 1], 2); 272691037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); 272791037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); 272891037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-3] = _mm_slli_epi16(step1b[-3], 2); 272991037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-2] = _mm_slli_epi16(step1b[-2], 2); 273091037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-1] = _mm_slli_epi16(step1b[-1], 2); 273191037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-0] = _mm_slli_epi16(step1b[-0], 2); 273291037db265ecdd914a26e056cf69207b4f50924ehkuang } 273391037db265ecdd914a26e056cf69207b4f50924ehkuang { 273491037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *ina = in + 12 * str1; 273591037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *inb = in + 19 * str1; 273691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i *step1a = &step1[12]; 273791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i *step1b = &step1[19]; 273891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 273991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 274091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 274191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 274291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 274391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 274491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 274591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 
274691037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 0] = _mm_add_epi16(ina0, inb0); 274791037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 1] = _mm_add_epi16(ina1, inb1); 274891037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 2] = _mm_add_epi16(ina2, inb2); 274991037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 3] = _mm_add_epi16(ina3, inb3); 275091037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-3] = _mm_sub_epi16(ina3, inb3); 275191037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-2] = _mm_sub_epi16(ina2, inb2); 275291037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-1] = _mm_sub_epi16(ina1, inb1); 275391037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-0] = _mm_sub_epi16(ina0, inb0); 275491037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); 275591037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); 275691037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); 275791037db265ecdd914a26e056cf69207b4f50924ehkuang step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); 275891037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-3] = _mm_slli_epi16(step1b[-3], 2); 275991037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-2] = _mm_slli_epi16(step1b[-2], 2); 276091037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-1] = _mm_slli_epi16(step1b[-1], 2); 276191037db265ecdd914a26e056cf69207b4f50924ehkuang step1b[-0] = _mm_slli_epi16(step1b[-0], 2); 276291037db265ecdd914a26e056cf69207b4f50924ehkuang } 276391037db265ecdd914a26e056cf69207b4f50924ehkuang } else { 276491037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *in = &intermediate[column_start]; 276591037db265ecdd914a26e056cf69207b4f50924ehkuang // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; 276691037db265ecdd914a26e056cf69207b4f50924ehkuang // Note: using the same approach as above to have common offset is 276791037db265ecdd914a26e056cf69207b4f50924ehkuang // counter-productive as all offsets 
can be calculated at compile 276891037db265ecdd914a26e056cf69207b4f50924ehkuang // time. 276991037db265ecdd914a26e056cf69207b4f50924ehkuang // Note: the next four blocks could be in a loop. That would help the 277091037db265ecdd914a26e056cf69207b4f50924ehkuang // instruction cache but is actually slower. 277191037db265ecdd914a26e056cf69207b4f50924ehkuang { 277291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); 277391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); 277491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); 277591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); 277691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); 277791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); 277891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); 277991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); 278091037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 0] = _mm_add_epi16(in00, in31); 278191037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 1] = _mm_add_epi16(in01, in30); 278291037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 2] = _mm_add_epi16(in02, in29); 278391037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 3] = _mm_add_epi16(in03, in28); 278491037db265ecdd914a26e056cf69207b4f50924ehkuang step1[28] = _mm_sub_epi16(in03, in28); 278591037db265ecdd914a26e056cf69207b4f50924ehkuang step1[29] = _mm_sub_epi16(in02, in29); 278691037db265ecdd914a26e056cf69207b4f50924ehkuang step1[30] = _mm_sub_epi16(in01, in30); 278791037db265ecdd914a26e056cf69207b4f50924ehkuang step1[31] = _mm_sub_epi16(in00, 
in31); 278891037db265ecdd914a26e056cf69207b4f50924ehkuang } 278991037db265ecdd914a26e056cf69207b4f50924ehkuang { 279091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); 279191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); 279291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); 279391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); 279491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); 279591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); 279691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); 279791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); 279891037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 4] = _mm_add_epi16(in04, in27); 279991037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 5] = _mm_add_epi16(in05, in26); 280091037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 6] = _mm_add_epi16(in06, in25); 280191037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 7] = _mm_add_epi16(in07, in24); 280291037db265ecdd914a26e056cf69207b4f50924ehkuang step1[24] = _mm_sub_epi16(in07, in24); 280391037db265ecdd914a26e056cf69207b4f50924ehkuang step1[25] = _mm_sub_epi16(in06, in25); 280491037db265ecdd914a26e056cf69207b4f50924ehkuang step1[26] = _mm_sub_epi16(in05, in26); 280591037db265ecdd914a26e056cf69207b4f50924ehkuang step1[27] = _mm_sub_epi16(in04, in27); 280691037db265ecdd914a26e056cf69207b4f50924ehkuang } 280791037db265ecdd914a26e056cf69207b4f50924ehkuang { 280891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); 
280991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); 281091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); 281191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); 281291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); 281391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); 281491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); 281591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); 281691037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 8] = _mm_add_epi16(in08, in23); 281791037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 9] = _mm_add_epi16(in09, in22); 281891037db265ecdd914a26e056cf69207b4f50924ehkuang step1[10] = _mm_add_epi16(in10, in21); 281991037db265ecdd914a26e056cf69207b4f50924ehkuang step1[11] = _mm_add_epi16(in11, in20); 282091037db265ecdd914a26e056cf69207b4f50924ehkuang step1[20] = _mm_sub_epi16(in11, in20); 282191037db265ecdd914a26e056cf69207b4f50924ehkuang step1[21] = _mm_sub_epi16(in10, in21); 282291037db265ecdd914a26e056cf69207b4f50924ehkuang step1[22] = _mm_sub_epi16(in09, in22); 282391037db265ecdd914a26e056cf69207b4f50924ehkuang step1[23] = _mm_sub_epi16(in08, in23); 282491037db265ecdd914a26e056cf69207b4f50924ehkuang } 282591037db265ecdd914a26e056cf69207b4f50924ehkuang { 282691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); 282791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); 282891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); 
282991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); 283091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); 283191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); 283291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); 283391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); 283491037db265ecdd914a26e056cf69207b4f50924ehkuang step1[12] = _mm_add_epi16(in12, in19); 283591037db265ecdd914a26e056cf69207b4f50924ehkuang step1[13] = _mm_add_epi16(in13, in18); 283691037db265ecdd914a26e056cf69207b4f50924ehkuang step1[14] = _mm_add_epi16(in14, in17); 283791037db265ecdd914a26e056cf69207b4f50924ehkuang step1[15] = _mm_add_epi16(in15, in16); 283891037db265ecdd914a26e056cf69207b4f50924ehkuang step1[16] = _mm_sub_epi16(in15, in16); 283991037db265ecdd914a26e056cf69207b4f50924ehkuang step1[17] = _mm_sub_epi16(in14, in17); 284091037db265ecdd914a26e056cf69207b4f50924ehkuang step1[18] = _mm_sub_epi16(in13, in18); 284191037db265ecdd914a26e056cf69207b4f50924ehkuang step1[19] = _mm_sub_epi16(in12, in19); 284291037db265ecdd914a26e056cf69207b4f50924ehkuang } 284391037db265ecdd914a26e056cf69207b4f50924ehkuang } 284491037db265ecdd914a26e056cf69207b4f50924ehkuang // Stage 2 284591037db265ecdd914a26e056cf69207b4f50924ehkuang { 284691037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 0] = _mm_add_epi16(step1[0], step1[15]); 284791037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 1] = _mm_add_epi16(step1[1], step1[14]); 284891037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 2] = _mm_add_epi16(step1[2], step1[13]); 284991037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 3] = _mm_add_epi16(step1[3], step1[12]); 285091037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 4] = 
_mm_add_epi16(step1[4], step1[11]); 285191037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 5] = _mm_add_epi16(step1[5], step1[10]); 285291037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]); 285391037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]); 285491037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]); 285591037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]); 285691037db265ecdd914a26e056cf69207b4f50924ehkuang step2[10] = _mm_sub_epi16(step1[5], step1[10]); 285791037db265ecdd914a26e056cf69207b4f50924ehkuang step2[11] = _mm_sub_epi16(step1[4], step1[11]); 285891037db265ecdd914a26e056cf69207b4f50924ehkuang step2[12] = _mm_sub_epi16(step1[3], step1[12]); 285991037db265ecdd914a26e056cf69207b4f50924ehkuang step2[13] = _mm_sub_epi16(step1[2], step1[13]); 286091037db265ecdd914a26e056cf69207b4f50924ehkuang step2[14] = _mm_sub_epi16(step1[1], step1[14]); 286191037db265ecdd914a26e056cf69207b4f50924ehkuang step2[15] = _mm_sub_epi16(step1[0], step1[15]); 286291037db265ecdd914a26e056cf69207b4f50924ehkuang } 286391037db265ecdd914a26e056cf69207b4f50924ehkuang { 286491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); 286591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); 286691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); 286791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); 286891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); 286991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); 287091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i 
s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); 287191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); 287291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); 287391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); 287491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); 287591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); 287691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); 287791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); 287891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); 287991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); 288091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); 288191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); 288291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); 288391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); 288491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); 288591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); 288691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); 288791037db265ecdd914a26e056cf69207b4f50924ehkuang const 
__m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); 288891037db265ecdd914a26e056cf69207b4f50924ehkuang // dct_const_round_shift 288991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); 289091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); 289191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); 289291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); 289391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); 289491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); 289591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); 289691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); 289791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); 289891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); 289991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); 290091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); 290191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); 290291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); 290391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); 
290491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); 290591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); 290691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); 290791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); 290891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); 290991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); 291091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); 291191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); 291291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); 291391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); 291491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); 291591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); 291691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); 291791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); 291891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); 291991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); 292091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); 
292191037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 292291037db265ecdd914a26e056cf69207b4f50924ehkuang step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); 292391037db265ecdd914a26e056cf69207b4f50924ehkuang step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); 292491037db265ecdd914a26e056cf69207b4f50924ehkuang step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); 292591037db265ecdd914a26e056cf69207b4f50924ehkuang step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); 292691037db265ecdd914a26e056cf69207b4f50924ehkuang step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); 292791037db265ecdd914a26e056cf69207b4f50924ehkuang step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); 292891037db265ecdd914a26e056cf69207b4f50924ehkuang step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); 292991037db265ecdd914a26e056cf69207b4f50924ehkuang step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); 293091037db265ecdd914a26e056cf69207b4f50924ehkuang } 293191037db265ecdd914a26e056cf69207b4f50924ehkuang // Stage 3 293291037db265ecdd914a26e056cf69207b4f50924ehkuang { 293391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); 293491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); 293591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); 293691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); 293791037db265ecdd914a26e056cf69207b4f50924ehkuang step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); 293891037db265ecdd914a26e056cf69207b4f50924ehkuang step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); 293991037db265ecdd914a26e056cf69207b4f50924ehkuang step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); 294091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); 294191037db265ecdd914a26e056cf69207b4f50924ehkuang } 294291037db265ecdd914a26e056cf69207b4f50924ehkuang { 294391037db265ecdd914a26e056cf69207b4f50924ehkuang 
const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); 294491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); 294591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); 294691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); 294791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); 294891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); 294991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); 295091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); 295191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); 295291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); 295391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); 295491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); 295591037db265ecdd914a26e056cf69207b4f50924ehkuang // dct_const_round_shift 295691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); 295791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); 295891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); 295991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); 296091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_12_4 = 
_mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); 296191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); 296291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); 296391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); 296491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); 296591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); 296691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); 296791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); 296891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); 296991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); 297091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); 297191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); 297291037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 297391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); 297491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); 297591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); 297691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); 297791037db265ecdd914a26e056cf69207b4f50924ehkuang } 297891037db265ecdd914a26e056cf69207b4f50924ehkuang { 297991037db265ecdd914a26e056cf69207b4f50924ehkuang step3[16] = _mm_add_epi16(step2[23], 
step1[16]); 298091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[17] = _mm_add_epi16(step2[22], step1[17]); 298191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[18] = _mm_add_epi16(step2[21], step1[18]); 298291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[19] = _mm_add_epi16(step2[20], step1[19]); 298391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[20] = _mm_sub_epi16(step1[19], step2[20]); 298491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[21] = _mm_sub_epi16(step1[18], step2[21]); 298591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[22] = _mm_sub_epi16(step1[17], step2[22]); 298691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[23] = _mm_sub_epi16(step1[16], step2[23]); 298791037db265ecdd914a26e056cf69207b4f50924ehkuang step3[24] = _mm_sub_epi16(step1[31], step2[24]); 298891037db265ecdd914a26e056cf69207b4f50924ehkuang step3[25] = _mm_sub_epi16(step1[30], step2[25]); 298991037db265ecdd914a26e056cf69207b4f50924ehkuang step3[26] = _mm_sub_epi16(step1[29], step2[26]); 299091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[27] = _mm_sub_epi16(step1[28], step2[27]); 299191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[28] = _mm_add_epi16(step2[27], step1[28]); 299291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[29] = _mm_add_epi16(step2[26], step1[29]); 299391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[30] = _mm_add_epi16(step2[25], step1[30]); 299491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[31] = _mm_add_epi16(step2[24], step1[31]); 299591037db265ecdd914a26e056cf69207b4f50924ehkuang } 299691037db265ecdd914a26e056cf69207b4f50924ehkuang // scale the magnitude down by 4 (arithmetic shift right by 2 after a rounding adjustment), hence the intermediate values are within 299791037db265ecdd914a26e056cf69207b4f50924ehkuang // the range of 16 bits. 
299891037db265ecdd914a26e056cf69207b4f50924ehkuang if (1 == pass) { 299991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero); 300091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero); 300191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero); 300291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero); 300391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero); 300491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero); 300591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero); 300691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero); 300791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); 300891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); 300991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero); 301091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero); 301191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero); 301291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero); 301391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); 301491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); 301591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero); 301691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero); 301791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i 
s3_18_0 = _mm_cmplt_epi16(step3[18], kZero); 301891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero); 301991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero); 302091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero); 302191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero); 302291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero); 302391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero); 302491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero); 302591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero); 302691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero); 302791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero); 302891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero); 302991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero); 303091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero); 303191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0); 303291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0); 303391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0); 303491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0); 303591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 4] = _mm_sub_epi16(step3[ 4], s3_04_0); 303691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0); 
303791037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0); 303891037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0); 303991037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0); 304091037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0); 304191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[10] = _mm_sub_epi16(step3[10], s3_10_0); 304291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[11] = _mm_sub_epi16(step3[11], s3_11_0); 304391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[12] = _mm_sub_epi16(step3[12], s3_12_0); 304491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[13] = _mm_sub_epi16(step3[13], s3_13_0); 304591037db265ecdd914a26e056cf69207b4f50924ehkuang step2[14] = _mm_sub_epi16(step2[14], s2_14_0); 304691037db265ecdd914a26e056cf69207b4f50924ehkuang step2[15] = _mm_sub_epi16(step2[15], s2_15_0); 304791037db265ecdd914a26e056cf69207b4f50924ehkuang step3[16] = _mm_sub_epi16(step3[16], s3_16_0); 304891037db265ecdd914a26e056cf69207b4f50924ehkuang step3[17] = _mm_sub_epi16(step3[17], s3_17_0); 304991037db265ecdd914a26e056cf69207b4f50924ehkuang step3[18] = _mm_sub_epi16(step3[18], s3_18_0); 305091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[19] = _mm_sub_epi16(step3[19], s3_19_0); 305191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[20] = _mm_sub_epi16(step3[20], s3_20_0); 305291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[21] = _mm_sub_epi16(step3[21], s3_21_0); 305391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[22] = _mm_sub_epi16(step3[22], s3_22_0); 305491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[23] = _mm_sub_epi16(step3[23], s3_23_0); 305591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[24] = _mm_sub_epi16(step3[24], s3_24_0); 305691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[25] = _mm_sub_epi16(step3[25], s3_25_0); 
305791037db265ecdd914a26e056cf69207b4f50924ehkuang step3[26] = _mm_sub_epi16(step3[26], s3_26_0); 305891037db265ecdd914a26e056cf69207b4f50924ehkuang step3[27] = _mm_sub_epi16(step3[27], s3_27_0); 305991037db265ecdd914a26e056cf69207b4f50924ehkuang step3[28] = _mm_sub_epi16(step3[28], s3_28_0); 306091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[29] = _mm_sub_epi16(step3[29], s3_29_0); 306191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[30] = _mm_sub_epi16(step3[30], s3_30_0); 306291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[31] = _mm_sub_epi16(step3[31], s3_31_0); 306391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 0] = _mm_add_epi16(step3[ 0], kOne); 306491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 1] = _mm_add_epi16(step3[ 1], kOne); 306591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 2] = _mm_add_epi16(step3[ 2], kOne); 306691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 3] = _mm_add_epi16(step3[ 3], kOne); 306791037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 4] = _mm_add_epi16(step3[ 4], kOne); 306891037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 5] = _mm_add_epi16(step3[ 5], kOne); 306991037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 6] = _mm_add_epi16(step3[ 6], kOne); 307091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 7] = _mm_add_epi16(step3[ 7], kOne); 307191037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 8] = _mm_add_epi16(step2[ 8], kOne); 307291037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 9] = _mm_add_epi16(step2[ 9], kOne); 307391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[10] = _mm_add_epi16(step3[10], kOne); 307491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[11] = _mm_add_epi16(step3[11], kOne); 307591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[12] = _mm_add_epi16(step3[12], kOne); 307691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[13] = _mm_add_epi16(step3[13], kOne); 307791037db265ecdd914a26e056cf69207b4f50924ehkuang step2[14] = 
_mm_add_epi16(step2[14], kOne); 307891037db265ecdd914a26e056cf69207b4f50924ehkuang step2[15] = _mm_add_epi16(step2[15], kOne); 307991037db265ecdd914a26e056cf69207b4f50924ehkuang step3[16] = _mm_add_epi16(step3[16], kOne); 308091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[17] = _mm_add_epi16(step3[17], kOne); 308191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[18] = _mm_add_epi16(step3[18], kOne); 308291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[19] = _mm_add_epi16(step3[19], kOne); 308391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[20] = _mm_add_epi16(step3[20], kOne); 308491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[21] = _mm_add_epi16(step3[21], kOne); 308591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[22] = _mm_add_epi16(step3[22], kOne); 308691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[23] = _mm_add_epi16(step3[23], kOne); 308791037db265ecdd914a26e056cf69207b4f50924ehkuang step3[24] = _mm_add_epi16(step3[24], kOne); 308891037db265ecdd914a26e056cf69207b4f50924ehkuang step3[25] = _mm_add_epi16(step3[25], kOne); 308991037db265ecdd914a26e056cf69207b4f50924ehkuang step3[26] = _mm_add_epi16(step3[26], kOne); 309091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[27] = _mm_add_epi16(step3[27], kOne); 309191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[28] = _mm_add_epi16(step3[28], kOne); 309291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[29] = _mm_add_epi16(step3[29], kOne); 309391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[30] = _mm_add_epi16(step3[30], kOne); 309491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[31] = _mm_add_epi16(step3[31], kOne); 309591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 0] = _mm_srai_epi16(step3[ 0], 2); 309691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 1] = _mm_srai_epi16(step3[ 1], 2); 309791037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 2] = _mm_srai_epi16(step3[ 2], 2); 309891037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 3] = 
_mm_srai_epi16(step3[ 3], 2); 309991037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 4] = _mm_srai_epi16(step3[ 4], 2); 310091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 5] = _mm_srai_epi16(step3[ 5], 2); 310191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 6] = _mm_srai_epi16(step3[ 6], 2); 310291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 7] = _mm_srai_epi16(step3[ 7], 2); 310391037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 8] = _mm_srai_epi16(step2[ 8], 2); 310491037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 9] = _mm_srai_epi16(step2[ 9], 2); 310591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[10] = _mm_srai_epi16(step3[10], 2); 310691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[11] = _mm_srai_epi16(step3[11], 2); 310791037db265ecdd914a26e056cf69207b4f50924ehkuang step3[12] = _mm_srai_epi16(step3[12], 2); 310891037db265ecdd914a26e056cf69207b4f50924ehkuang step3[13] = _mm_srai_epi16(step3[13], 2); 310991037db265ecdd914a26e056cf69207b4f50924ehkuang step2[14] = _mm_srai_epi16(step2[14], 2); 311091037db265ecdd914a26e056cf69207b4f50924ehkuang step2[15] = _mm_srai_epi16(step2[15], 2); 311191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[16] = _mm_srai_epi16(step3[16], 2); 311291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[17] = _mm_srai_epi16(step3[17], 2); 311391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[18] = _mm_srai_epi16(step3[18], 2); 311491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[19] = _mm_srai_epi16(step3[19], 2); 311591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[20] = _mm_srai_epi16(step3[20], 2); 311691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[21] = _mm_srai_epi16(step3[21], 2); 311791037db265ecdd914a26e056cf69207b4f50924ehkuang step3[22] = _mm_srai_epi16(step3[22], 2); 311891037db265ecdd914a26e056cf69207b4f50924ehkuang step3[23] = _mm_srai_epi16(step3[23], 2); 311991037db265ecdd914a26e056cf69207b4f50924ehkuang step3[24] = _mm_srai_epi16(step3[24], 2); 
312091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[25] = _mm_srai_epi16(step3[25], 2); 312191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[26] = _mm_srai_epi16(step3[26], 2); 312291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[27] = _mm_srai_epi16(step3[27], 2); 312391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[28] = _mm_srai_epi16(step3[28], 2); 312491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[29] = _mm_srai_epi16(step3[29], 2); 312591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[30] = _mm_srai_epi16(step3[30], 2); 312691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[31] = _mm_srai_epi16(step3[31], 2); 312791037db265ecdd914a26e056cf69207b4f50924ehkuang } 312891037db265ecdd914a26e056cf69207b4f50924ehkuang // Stage 4 312991037db265ecdd914a26e056cf69207b4f50924ehkuang { 313091037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]); 313191037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]); 313291037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]); 313391037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]); 313491037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]); 313591037db265ecdd914a26e056cf69207b4f50924ehkuang step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]); 313691037db265ecdd914a26e056cf69207b4f50924ehkuang step1[10] = _mm_sub_epi16(step2[ 9], step3[10]); 313791037db265ecdd914a26e056cf69207b4f50924ehkuang step1[11] = _mm_sub_epi16(step2[ 8], step3[11]); 313891037db265ecdd914a26e056cf69207b4f50924ehkuang step1[12] = _mm_sub_epi16(step2[15], step3[12]); 313991037db265ecdd914a26e056cf69207b4f50924ehkuang step1[13] = _mm_sub_epi16(step2[14], step3[13]); 314091037db265ecdd914a26e056cf69207b4f50924ehkuang step1[14] = _mm_add_epi16(step3[13], step2[14]); 314191037db265ecdd914a26e056cf69207b4f50924ehkuang step1[15] = 
_mm_add_epi16(step3[12], step2[15]); 314291037db265ecdd914a26e056cf69207b4f50924ehkuang } 314391037db265ecdd914a26e056cf69207b4f50924ehkuang { 314491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); 314591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); 314691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); 314791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); 314891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); 314991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); 315091037db265ecdd914a26e056cf69207b4f50924ehkuang // dct_const_round_shift 315191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); 315291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); 315391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); 315491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); 315591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); 315691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); 315791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); 315891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); 315991037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 316091037db265ecdd914a26e056cf69207b4f50924ehkuang step1[5] = 
_mm_packs_epi32(s1_05_6, s1_05_7); 316191037db265ecdd914a26e056cf69207b4f50924ehkuang step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); 316291037db265ecdd914a26e056cf69207b4f50924ehkuang } 316391037db265ecdd914a26e056cf69207b4f50924ehkuang { 316491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); 316591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); 316691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); 316791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); 316891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); 316991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); 317091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); 317191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); 317291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); 317391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); 317491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); 317591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); 317691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); 317791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); 317891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); 
317991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); 318091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); 318191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); 318291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); 318391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); 318491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); 318591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); 318691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); 318791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); 318891037db265ecdd914a26e056cf69207b4f50924ehkuang // dct_const_round_shift 318991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); 319091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); 319191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); 319291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); 319391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); 319491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); 319591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); 
319691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); 319791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); 319891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); 319991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); 320091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); 320191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); 320291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); 320391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); 320491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); 320591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); 320691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); 320791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); 320891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); 320991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); 321091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); 321191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); 321291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_21_7 = 
_mm_srai_epi32(s1_21_5, DCT_CONST_BITS); 321391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); 321491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); 321591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); 321691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); 321791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); 321891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); 321991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); 322091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); 322191037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 322291037db265ecdd914a26e056cf69207b4f50924ehkuang step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); 322391037db265ecdd914a26e056cf69207b4f50924ehkuang step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); 322491037db265ecdd914a26e056cf69207b4f50924ehkuang step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); 322591037db265ecdd914a26e056cf69207b4f50924ehkuang step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); 322691037db265ecdd914a26e056cf69207b4f50924ehkuang step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); 322791037db265ecdd914a26e056cf69207b4f50924ehkuang step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); 322891037db265ecdd914a26e056cf69207b4f50924ehkuang step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); 322991037db265ecdd914a26e056cf69207b4f50924ehkuang step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); 323091037db265ecdd914a26e056cf69207b4f50924ehkuang } 323191037db265ecdd914a26e056cf69207b4f50924ehkuang // Stage 5 323291037db265ecdd914a26e056cf69207b4f50924ehkuang { 
323391037db265ecdd914a26e056cf69207b4f50924ehkuang step2[4] = _mm_add_epi16(step1[5], step3[4]); 323491037db265ecdd914a26e056cf69207b4f50924ehkuang step2[5] = _mm_sub_epi16(step3[4], step1[5]); 323591037db265ecdd914a26e056cf69207b4f50924ehkuang step2[6] = _mm_sub_epi16(step3[7], step1[6]); 323691037db265ecdd914a26e056cf69207b4f50924ehkuang step2[7] = _mm_add_epi16(step1[6], step3[7]); 323791037db265ecdd914a26e056cf69207b4f50924ehkuang } 323891037db265ecdd914a26e056cf69207b4f50924ehkuang { 323991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); 324091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); 324191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); 324291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); 324391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); 324491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); 324591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); 324691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); 324791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); 324891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); 324991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); 325091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); 325191037db265ecdd914a26e056cf69207b4f50924ehkuang // dct_const_round_shift 
325291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); 325391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); 325491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); 325591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); 325691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); 325791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); 325891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); 325991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); 326091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); 326191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); 326291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); 326391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); 326491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); 326591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); 326691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); 326791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); 326891037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 
326991037db265ecdd914a26e056cf69207b4f50924ehkuang out[ 0] = _mm_packs_epi32(out_00_6, out_00_7); 327091037db265ecdd914a26e056cf69207b4f50924ehkuang out[16] = _mm_packs_epi32(out_16_6, out_16_7); 327191037db265ecdd914a26e056cf69207b4f50924ehkuang out[ 8] = _mm_packs_epi32(out_08_6, out_08_7); 327291037db265ecdd914a26e056cf69207b4f50924ehkuang out[24] = _mm_packs_epi32(out_24_6, out_24_7); 327391037db265ecdd914a26e056cf69207b4f50924ehkuang } 327491037db265ecdd914a26e056cf69207b4f50924ehkuang { 327591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]); 327691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]); 327791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); 327891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); 327991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); 328091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); 328191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); 328291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); 328391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); 328491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); 328591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); 328691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); 328791037db265ecdd914a26e056cf69207b4f50924ehkuang // dct_const_round_shift 
328891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); 328991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); 329091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); 329191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); 329291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); 329391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); 329491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); 329591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); 329691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); 329791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); 329891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); 329991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); 330091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); 330191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); 330291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); 330391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); 330491037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 
330591037db265ecdd914a26e056cf69207b4f50924ehkuang step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7); 330691037db265ecdd914a26e056cf69207b4f50924ehkuang step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); 330791037db265ecdd914a26e056cf69207b4f50924ehkuang step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); 330891037db265ecdd914a26e056cf69207b4f50924ehkuang step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); 330991037db265ecdd914a26e056cf69207b4f50924ehkuang } 331091037db265ecdd914a26e056cf69207b4f50924ehkuang { 331191037db265ecdd914a26e056cf69207b4f50924ehkuang step2[16] = _mm_add_epi16(step1[19], step3[16]); 331291037db265ecdd914a26e056cf69207b4f50924ehkuang step2[17] = _mm_add_epi16(step1[18], step3[17]); 331391037db265ecdd914a26e056cf69207b4f50924ehkuang step2[18] = _mm_sub_epi16(step3[17], step1[18]); 331491037db265ecdd914a26e056cf69207b4f50924ehkuang step2[19] = _mm_sub_epi16(step3[16], step1[19]); 331591037db265ecdd914a26e056cf69207b4f50924ehkuang step2[20] = _mm_sub_epi16(step3[23], step1[20]); 331691037db265ecdd914a26e056cf69207b4f50924ehkuang step2[21] = _mm_sub_epi16(step3[22], step1[21]); 331791037db265ecdd914a26e056cf69207b4f50924ehkuang step2[22] = _mm_add_epi16(step1[21], step3[22]); 331891037db265ecdd914a26e056cf69207b4f50924ehkuang step2[23] = _mm_add_epi16(step1[20], step3[23]); 331991037db265ecdd914a26e056cf69207b4f50924ehkuang step2[24] = _mm_add_epi16(step1[27], step3[24]); 332091037db265ecdd914a26e056cf69207b4f50924ehkuang step2[25] = _mm_add_epi16(step1[26], step3[25]); 332191037db265ecdd914a26e056cf69207b4f50924ehkuang step2[26] = _mm_sub_epi16(step3[25], step1[26]); 332291037db265ecdd914a26e056cf69207b4f50924ehkuang step2[27] = _mm_sub_epi16(step3[24], step1[27]); 332391037db265ecdd914a26e056cf69207b4f50924ehkuang step2[28] = _mm_sub_epi16(step3[31], step1[28]); 332491037db265ecdd914a26e056cf69207b4f50924ehkuang step2[29] = _mm_sub_epi16(step3[30], step1[29]); 332591037db265ecdd914a26e056cf69207b4f50924ehkuang step2[30] = _mm_add_epi16(step1[29], step3[30]); 
332691037db265ecdd914a26e056cf69207b4f50924ehkuang step2[31] = _mm_add_epi16(step1[28], step3[31]); 332791037db265ecdd914a26e056cf69207b4f50924ehkuang } 332891037db265ecdd914a26e056cf69207b4f50924ehkuang // Stage 6 332991037db265ecdd914a26e056cf69207b4f50924ehkuang { 333091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); 333191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); 333291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); 333391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); 333491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); 333591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); 333691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); 333791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); 333891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); 333991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); 334091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); 334191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); 334291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); 334391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); 334491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_28_2 = 
_mm_madd_epi16(out_28_0, k__cospi_m04_p28); 334591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); 334691037db265ecdd914a26e056cf69207b4f50924ehkuang // dct_const_round_shift 334791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); 334891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); 334991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); 335091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); 335191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); 335291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); 335391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); 335491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); 335591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); 335691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); 335791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); 335891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); 335991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); 336091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); 336191037db265ecdd914a26e056cf69207b4f50924ehkuang const 
__m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); 336291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); 336391037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 336491037db265ecdd914a26e056cf69207b4f50924ehkuang out[ 4] = _mm_packs_epi32(out_04_6, out_04_7); 336591037db265ecdd914a26e056cf69207b4f50924ehkuang out[20] = _mm_packs_epi32(out_20_6, out_20_7); 336691037db265ecdd914a26e056cf69207b4f50924ehkuang out[12] = _mm_packs_epi32(out_12_6, out_12_7); 336791037db265ecdd914a26e056cf69207b4f50924ehkuang out[28] = _mm_packs_epi32(out_28_6, out_28_7); 336891037db265ecdd914a26e056cf69207b4f50924ehkuang } 336991037db265ecdd914a26e056cf69207b4f50924ehkuang { 337091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]); 337191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]); 337291037db265ecdd914a26e056cf69207b4f50924ehkuang step3[10] = _mm_sub_epi16(step1[11], step2[10]); 337391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[11] = _mm_add_epi16(step2[10], step1[11]); 337491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[12] = _mm_add_epi16(step2[13], step1[12]); 337591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[13] = _mm_sub_epi16(step1[12], step2[13]); 337691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[14] = _mm_sub_epi16(step1[15], step2[14]); 337791037db265ecdd914a26e056cf69207b4f50924ehkuang step3[15] = _mm_add_epi16(step2[14], step1[15]); 337891037db265ecdd914a26e056cf69207b4f50924ehkuang } 337991037db265ecdd914a26e056cf69207b4f50924ehkuang { 338091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); 338191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); 338291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); 
338391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); 338491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); 338591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); 338691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); 338791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); 338891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); 338991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); 339091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); 339191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); 339291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); 339391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); 339491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); 339591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); 339691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); 339791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); 339891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); 339991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, 
k__cospi_p12_p20); 340091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); 340191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); 340291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); 340391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); 340491037db265ecdd914a26e056cf69207b4f50924ehkuang // dct_const_round_shift 340591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); 340691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); 340791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); 340891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); 340991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); 341091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); 341191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); 341291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); 341391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); 341491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); 341591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); 341691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, 
DCT_CONST_BITS); 341791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); 341891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); 341991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); 342091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); 342191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); 342291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); 342391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); 342491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); 342591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); 342691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); 342791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); 342891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); 342991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); 343091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); 343191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); 343291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); 343391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_29_6 = 
_mm_srai_epi32(s3_29_4, DCT_CONST_BITS); 343491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); 343591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); 343691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); 343791037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 343891037db265ecdd914a26e056cf69207b4f50924ehkuang step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); 343991037db265ecdd914a26e056cf69207b4f50924ehkuang step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); 344091037db265ecdd914a26e056cf69207b4f50924ehkuang step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); 344191037db265ecdd914a26e056cf69207b4f50924ehkuang step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); 344291037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 344391037db265ecdd914a26e056cf69207b4f50924ehkuang step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); 344491037db265ecdd914a26e056cf69207b4f50924ehkuang step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); 344591037db265ecdd914a26e056cf69207b4f50924ehkuang step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); 344691037db265ecdd914a26e056cf69207b4f50924ehkuang step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); 344791037db265ecdd914a26e056cf69207b4f50924ehkuang } 344891037db265ecdd914a26e056cf69207b4f50924ehkuang // Stage 7 344991037db265ecdd914a26e056cf69207b4f50924ehkuang { 345091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]); 345191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]); 345291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]); 345391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]); 345491037db265ecdd914a26e056cf69207b4f50924ehkuang const 
__m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); 345591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); 345691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); 345791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); 345891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); 345991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); 346091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); 346191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); 346291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); 346391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); 346491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); 346591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); 346691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); 346791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); 346891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); 346991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); 347091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); 
347191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); 347291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); 347391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); 347491037db265ecdd914a26e056cf69207b4f50924ehkuang // dct_const_round_shift 347591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); 347691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); 347791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); 347891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); 347991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); 348091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); 348191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); 348291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); 348391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); 348491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); 348591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); 348691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); 348791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_14_4 = 
_mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); 348891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); 348991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); 349091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); 349191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); 349291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); 349391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); 349491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); 349591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); 349691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); 349791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); 349891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); 349991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); 350091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); 350191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); 350291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); 350391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); 
350491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); 350591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); 350691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); 350791037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 350891037db265ecdd914a26e056cf69207b4f50924ehkuang out[ 2] = _mm_packs_epi32(out_02_6, out_02_7); 350991037db265ecdd914a26e056cf69207b4f50924ehkuang out[18] = _mm_packs_epi32(out_18_6, out_18_7); 351091037db265ecdd914a26e056cf69207b4f50924ehkuang out[10] = _mm_packs_epi32(out_10_6, out_10_7); 351191037db265ecdd914a26e056cf69207b4f50924ehkuang out[26] = _mm_packs_epi32(out_26_6, out_26_7); 351291037db265ecdd914a26e056cf69207b4f50924ehkuang out[ 6] = _mm_packs_epi32(out_06_6, out_06_7); 351391037db265ecdd914a26e056cf69207b4f50924ehkuang out[22] = _mm_packs_epi32(out_22_6, out_22_7); 351491037db265ecdd914a26e056cf69207b4f50924ehkuang out[14] = _mm_packs_epi32(out_14_6, out_14_7); 351591037db265ecdd914a26e056cf69207b4f50924ehkuang out[30] = _mm_packs_epi32(out_30_6, out_30_7); 351691037db265ecdd914a26e056cf69207b4f50924ehkuang } 351791037db265ecdd914a26e056cf69207b4f50924ehkuang { 351891037db265ecdd914a26e056cf69207b4f50924ehkuang step1[16] = _mm_add_epi16(step3[17], step2[16]); 351991037db265ecdd914a26e056cf69207b4f50924ehkuang step1[17] = _mm_sub_epi16(step2[16], step3[17]); 352091037db265ecdd914a26e056cf69207b4f50924ehkuang step1[18] = _mm_sub_epi16(step2[19], step3[18]); 352191037db265ecdd914a26e056cf69207b4f50924ehkuang step1[19] = _mm_add_epi16(step3[18], step2[19]); 352291037db265ecdd914a26e056cf69207b4f50924ehkuang step1[20] = _mm_add_epi16(step3[21], step2[20]); 352391037db265ecdd914a26e056cf69207b4f50924ehkuang step1[21] = _mm_sub_epi16(step2[20], step3[21]); 352491037db265ecdd914a26e056cf69207b4f50924ehkuang step1[22] = _mm_sub_epi16(step2[23], 
step3[22]); 352591037db265ecdd914a26e056cf69207b4f50924ehkuang step1[23] = _mm_add_epi16(step3[22], step2[23]); 352691037db265ecdd914a26e056cf69207b4f50924ehkuang step1[24] = _mm_add_epi16(step3[25], step2[24]); 352791037db265ecdd914a26e056cf69207b4f50924ehkuang step1[25] = _mm_sub_epi16(step2[24], step3[25]); 352891037db265ecdd914a26e056cf69207b4f50924ehkuang step1[26] = _mm_sub_epi16(step2[27], step3[26]); 352991037db265ecdd914a26e056cf69207b4f50924ehkuang step1[27] = _mm_add_epi16(step3[26], step2[27]); 353091037db265ecdd914a26e056cf69207b4f50924ehkuang step1[28] = _mm_add_epi16(step3[29], step2[28]); 353191037db265ecdd914a26e056cf69207b4f50924ehkuang step1[29] = _mm_sub_epi16(step2[28], step3[29]); 353291037db265ecdd914a26e056cf69207b4f50924ehkuang step1[30] = _mm_sub_epi16(step2[31], step3[30]); 353391037db265ecdd914a26e056cf69207b4f50924ehkuang step1[31] = _mm_add_epi16(step3[30], step2[31]); 353491037db265ecdd914a26e056cf69207b4f50924ehkuang } 353591037db265ecdd914a26e056cf69207b4f50924ehkuang // Final stage --- outputs indices are bit-reversed. 
353691037db265ecdd914a26e056cf69207b4f50924ehkuang { 353791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); 353891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); 353991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); 354091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); 354191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); 354291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); 354391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); 354491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); 354591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); 354691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); 354791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); 354891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); 354991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); 355091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); 355191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); 355291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); 
355391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); 355491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); 355591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); 355691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); 355791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); 355891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); 355991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); 356091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); 356191037db265ecdd914a26e056cf69207b4f50924ehkuang // dct_const_round_shift 356291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); 356391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); 356491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); 356591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); 356691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); 356791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); 356891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); 356991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_25_5 = 
_mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); 357091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); 357191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); 357291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); 357391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); 357491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); 357591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); 357691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); 357791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); 357891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); 357991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); 358091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); 358191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); 358291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); 358391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); 358491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); 358591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); 
358691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); 358791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); 358891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); 358991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); 359091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); 359191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); 359291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); 359391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); 359491037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 359591037db265ecdd914a26e056cf69207b4f50924ehkuang out[ 1] = _mm_packs_epi32(out_01_6, out_01_7); 359691037db265ecdd914a26e056cf69207b4f50924ehkuang out[17] = _mm_packs_epi32(out_17_6, out_17_7); 359791037db265ecdd914a26e056cf69207b4f50924ehkuang out[ 9] = _mm_packs_epi32(out_09_6, out_09_7); 359891037db265ecdd914a26e056cf69207b4f50924ehkuang out[25] = _mm_packs_epi32(out_25_6, out_25_7); 359991037db265ecdd914a26e056cf69207b4f50924ehkuang out[ 7] = _mm_packs_epi32(out_07_6, out_07_7); 360091037db265ecdd914a26e056cf69207b4f50924ehkuang out[23] = _mm_packs_epi32(out_23_6, out_23_7); 360191037db265ecdd914a26e056cf69207b4f50924ehkuang out[15] = _mm_packs_epi32(out_15_6, out_15_7); 360291037db265ecdd914a26e056cf69207b4f50924ehkuang out[31] = _mm_packs_epi32(out_31_6, out_31_7); 360391037db265ecdd914a26e056cf69207b4f50924ehkuang } 360491037db265ecdd914a26e056cf69207b4f50924ehkuang { 360591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_05_0 = 
_mm_unpacklo_epi16(step1[20], step1[27]); 360691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); 360791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); 360891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); 360991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); 361091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); 361191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); 361291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); 361391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); 361491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); 361591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); 361691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); 361791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); 361891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); 361991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); 362091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); 362191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); 362291037db265ecdd914a26e056cf69207b4f50924ehkuang 
const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); 362391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); 362491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); 362591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); 362691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); 362791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); 362891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); 362991037db265ecdd914a26e056cf69207b4f50924ehkuang // dct_const_round_shift 363091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); 363191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); 363291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); 363391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); 363491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); 363591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); 363691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); 363791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); 363891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); 
363991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); 364091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); 364191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); 364291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); 364391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); 364491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); 364591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); 364691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); 364791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); 364891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); 364991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); 365091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); 365191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); 365291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); 365391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); 365491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); 365591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i 
out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); 365691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); 365791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); 365891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); 365991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); 366091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); 366191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); 366291037db265ecdd914a26e056cf69207b4f50924ehkuang // Combine 366391037db265ecdd914a26e056cf69207b4f50924ehkuang out[ 5] = _mm_packs_epi32(out_05_6, out_05_7); 366491037db265ecdd914a26e056cf69207b4f50924ehkuang out[21] = _mm_packs_epi32(out_21_6, out_21_7); 366591037db265ecdd914a26e056cf69207b4f50924ehkuang out[13] = _mm_packs_epi32(out_13_6, out_13_7); 366691037db265ecdd914a26e056cf69207b4f50924ehkuang out[29] = _mm_packs_epi32(out_29_6, out_29_7); 366791037db265ecdd914a26e056cf69207b4f50924ehkuang out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); 366891037db265ecdd914a26e056cf69207b4f50924ehkuang out[19] = _mm_packs_epi32(out_19_6, out_19_7); 366991037db265ecdd914a26e056cf69207b4f50924ehkuang out[11] = _mm_packs_epi32(out_11_6, out_11_7); 367091037db265ecdd914a26e056cf69207b4f50924ehkuang out[27] = _mm_packs_epi32(out_27_6, out_27_7); 367191037db265ecdd914a26e056cf69207b4f50924ehkuang } 367291037db265ecdd914a26e056cf69207b4f50924ehkuang // Transpose the results, do it as four 8x8 transposes. 
367391037db265ecdd914a26e056cf69207b4f50924ehkuang { 367491037db265ecdd914a26e056cf69207b4f50924ehkuang int transpose_block; 367591037db265ecdd914a26e056cf69207b4f50924ehkuang int16_t *output; 367691037db265ecdd914a26e056cf69207b4f50924ehkuang if (0 == pass) { 367791037db265ecdd914a26e056cf69207b4f50924ehkuang output = &intermediate[column_start * 32]; 367891037db265ecdd914a26e056cf69207b4f50924ehkuang } else { 367991037db265ecdd914a26e056cf69207b4f50924ehkuang output = &output_org[column_start * 32]; 368091037db265ecdd914a26e056cf69207b4f50924ehkuang } 368191037db265ecdd914a26e056cf69207b4f50924ehkuang for (transpose_block = 0; transpose_block < 4; ++transpose_block) { 368291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i *this_out = &out[8 * transpose_block]; 368391037db265ecdd914a26e056cf69207b4f50924ehkuang // 00 01 02 03 04 05 06 07 368491037db265ecdd914a26e056cf69207b4f50924ehkuang // 10 11 12 13 14 15 16 17 368591037db265ecdd914a26e056cf69207b4f50924ehkuang // 20 21 22 23 24 25 26 27 368691037db265ecdd914a26e056cf69207b4f50924ehkuang // 30 31 32 33 34 35 36 37 368791037db265ecdd914a26e056cf69207b4f50924ehkuang // 40 41 42 43 44 45 46 47 368891037db265ecdd914a26e056cf69207b4f50924ehkuang // 50 51 52 53 54 55 56 57 368991037db265ecdd914a26e056cf69207b4f50924ehkuang // 60 61 62 63 64 65 66 67 369091037db265ecdd914a26e056cf69207b4f50924ehkuang // 70 71 72 73 74 75 76 77 369191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); 369291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); 369391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); 369491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); 369591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); 
369691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); 369791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); 369891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); 369991037db265ecdd914a26e056cf69207b4f50924ehkuang // 00 10 01 11 02 12 03 13 370091037db265ecdd914a26e056cf69207b4f50924ehkuang // 20 30 21 31 22 32 23 33 370191037db265ecdd914a26e056cf69207b4f50924ehkuang // 04 14 05 15 06 16 07 17 370291037db265ecdd914a26e056cf69207b4f50924ehkuang // 24 34 25 35 26 36 27 37 370391037db265ecdd914a26e056cf69207b4f50924ehkuang // 40 50 41 51 42 52 43 53 370491037db265ecdd914a26e056cf69207b4f50924ehkuang // 60 70 61 71 62 72 63 73 370591037db265ecdd914a26e056cf69207b4f50924ehkuang // 44 54 45 55 46 56 47 57 370691037db265ecdd914a26e056cf69207b4f50924ehkuang // 64 74 65 75 66 76 67 77 370791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 370891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 370991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 371091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 371191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 371291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 371391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 371491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 371591037db265ecdd914a26e056cf69207b4f50924ehkuang // 00 10 20 30 01 11 21 31 371691037db265ecdd914a26e056cf69207b4f50924ehkuang // 40 50 60 70 41 51 61 71 
371791037db265ecdd914a26e056cf69207b4f50924ehkuang // 02 12 22 32 03 13 23 33 371891037db265ecdd914a26e056cf69207b4f50924ehkuang // 42 52 62 72 43 53 63 73 371991037db265ecdd914a26e056cf69207b4f50924ehkuang // 04 14 24 34 05 15 25 35 372091037db265ecdd914a26e056cf69207b4f50924ehkuang // 44 54 64 74 45 55 65 75 372191037db265ecdd914a26e056cf69207b4f50924ehkuang // 06 16 26 36 07 17 27 37 372291037db265ecdd914a26e056cf69207b4f50924ehkuang // 46 56 66 76 47 57 67 77 372391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 372491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 372591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 372691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 372791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 372891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 372991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 373091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 373191037db265ecdd914a26e056cf69207b4f50924ehkuang // 00 10 20 30 40 50 60 70 373291037db265ecdd914a26e056cf69207b4f50924ehkuang // 01 11 21 31 41 51 61 71 373391037db265ecdd914a26e056cf69207b4f50924ehkuang // 02 12 22 32 42 52 62 72 373491037db265ecdd914a26e056cf69207b4f50924ehkuang // 03 13 23 33 43 53 63 73 373591037db265ecdd914a26e056cf69207b4f50924ehkuang // 04 14 24 34 44 54 64 74 373691037db265ecdd914a26e056cf69207b4f50924ehkuang // 05 15 25 35 45 55 65 75 373791037db265ecdd914a26e056cf69207b4f50924ehkuang // 06 16 26 36 46 56 66 76 373891037db265ecdd914a26e056cf69207b4f50924ehkuang // 07 17 27 37 47 57 67 77 373991037db265ecdd914a26e056cf69207b4f50924ehkuang if (0 == pass) { 
374091037db265ecdd914a26e056cf69207b4f50924ehkuang // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; 374191037db265ecdd914a26e056cf69207b4f50924ehkuang // TODO(cd): see quality impact of only doing 374291037db265ecdd914a26e056cf69207b4f50924ehkuang // output[j] = (output[j] + 1) >> 2; 374391037db265ecdd914a26e056cf69207b4f50924ehkuang // which would remove the code between here ... 374491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); 374591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); 374691037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); 374791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); 374891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); 374991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); 375091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); 375191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); 375291037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); 375391037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); 375491037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); 375591037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); 375691037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); 375791037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); 375891037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); 375991037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); 376091037db265ecdd914a26e056cf69207b4f50924ehkuang // ... and here. 
376191037db265ecdd914a26e056cf69207b4f50924ehkuang // PS: also change code in vp9/encoder/vp9_dct.c 376291037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_0 = _mm_add_epi16(tr2_0, kOne); 376391037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_1 = _mm_add_epi16(tr2_1, kOne); 376491037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_2 = _mm_add_epi16(tr2_2, kOne); 376591037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_3 = _mm_add_epi16(tr2_3, kOne); 376691037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_4 = _mm_add_epi16(tr2_4, kOne); 376791037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_5 = _mm_add_epi16(tr2_5, kOne); 376891037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_6 = _mm_add_epi16(tr2_6, kOne); 376991037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_7 = _mm_add_epi16(tr2_7, kOne); 377091037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_0 = _mm_srai_epi16(tr2_0, 2); 377191037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_1 = _mm_srai_epi16(tr2_1, 2); 377291037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_2 = _mm_srai_epi16(tr2_2, 2); 377391037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_3 = _mm_srai_epi16(tr2_3, 2); 377491037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_4 = _mm_srai_epi16(tr2_4, 2); 377591037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_5 = _mm_srai_epi16(tr2_5, 2); 377691037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_6 = _mm_srai_epi16(tr2_6, 2); 377791037db265ecdd914a26e056cf69207b4f50924ehkuang tr2_7 = _mm_srai_epi16(tr2_7, 2); 377891037db265ecdd914a26e056cf69207b4f50924ehkuang } 377991037db265ecdd914a26e056cf69207b4f50924ehkuang // Note: even though all these stores are aligned, using the aligned 378091037db265ecdd914a26e056cf69207b4f50924ehkuang // intrinsic make the code slightly slower. 
378191037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0); 378291037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1); 378391037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2); 378491037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3); 378591037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); 378691037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); 378791037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); 378891037db265ecdd914a26e056cf69207b4f50924ehkuang _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); 378991037db265ecdd914a26e056cf69207b4f50924ehkuang // Process next 8x8 379091037db265ecdd914a26e056cf69207b4f50924ehkuang output += 8; 379191037db265ecdd914a26e056cf69207b4f50924ehkuang } 379291037db265ecdd914a26e056cf69207b4f50924ehkuang } 379391037db265ecdd914a26e056cf69207b4f50924ehkuang } 379491037db265ecdd914a26e056cf69207b4f50924ehkuang } 379591037db265ecdd914a26e056cf69207b4f50924ehkuang} 3796