1ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang/* 2ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * 4ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * Use of this source code is governed by a BSD-style license 5ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * that can be found in the LICENSE file in the root of the source 6ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * tree. An additional intellectual property rights grant can be found 7ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * in the file PATENTS. All contributing project authors may 8ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * be found in the AUTHORS file in the root of the source tree. 9ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang */ 10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <assert.h> 12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <emmintrin.h> // SSE2 13ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "./vpx_config.h" 14ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vpx/vpx_integer.h" 15ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_common.h" 16ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_idct.h" 17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 189b35249446b07f40ac5fcc3205f2c048616efacchkuang#define RECON_AND_STORE4X4(dest, in_x) \ 199b35249446b07f40ac5fcc3205f2c048616efacchkuang{ \ 209b35249446b07f40ac5fcc3205f2c048616efacchkuang __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ 219b35249446b07f40ac5fcc3205f2c048616efacchkuang d0 = _mm_unpacklo_epi8(d0, zero); \ 229b35249446b07f40ac5fcc3205f2c048616efacchkuang d0 = _mm_add_epi16(in_x, d0); \ 239b35249446b07f40ac5fcc3205f2c048616efacchkuang d0 = _mm_packus_epi16(d0, d0); \ 249b35249446b07f40ac5fcc3205f2c048616efacchkuang *(int *)dest = _mm_cvtsi128_si32(d0); \ 
259b35249446b07f40ac5fcc3205f2c048616efacchkuang dest += stride; \ 269b35249446b07f40ac5fcc3205f2c048616efacchkuang} 279b35249446b07f40ac5fcc3205f2c048616efacchkuang 285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 29ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 30ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i eight = _mm_set1_epi16(8); 31ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, 32ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang (int16_t)cospi_16_64, (int16_t)-cospi_16_64, 33ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang (int16_t)cospi_24_64, (int16_t)-cospi_8_64, 34ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang (int16_t)cospi_8_64, (int16_t)cospi_24_64); 35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i input0, input1, input2, input3; 37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Rows 399b35249446b07f40ac5fcc3205f2c048616efacchkuang input0 = _mm_load_si128((const __m128i *)input); 409b35249446b07f40ac5fcc3205f2c048616efacchkuang input2 = _mm_load_si128((const __m128i *)(input + 8)); 41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 42ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Construct i3, i1, i3, i1, i2, i0, i2, i0 43ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_shufflelo_epi16(input0, 0xd8); 449b35249446b07f40ac5fcc3205f2c048616efacchkuang input0 = _mm_shufflehi_epi16(input0, 0xd8); 45ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_shufflelo_epi16(input2, 0xd8); 469b35249446b07f40ac5fcc3205f2c048616efacchkuang input2 = _mm_shufflehi_epi16(input2, 0xd8); 47ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
489b35249446b07f40ac5fcc3205f2c048616efacchkuang input1 = _mm_unpackhi_epi32(input0, input0); 49ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_unpacklo_epi32(input0, input0); 509b35249446b07f40ac5fcc3205f2c048616efacchkuang input3 = _mm_unpackhi_epi32(input2, input2); 51ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_unpacklo_epi32(input2, input2); 52ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 53ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage 1 54ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_madd_epi16(input0, cst); 55ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_madd_epi16(input1, cst); 56ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_madd_epi16(input2, cst); 57ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_madd_epi16(input3, cst); 58ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 59ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_add_epi32(input0, rounding); 60ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_add_epi32(input1, rounding); 61ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_add_epi32(input2, rounding); 62ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_add_epi32(input3, rounding); 63ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 64ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); 65ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); 66ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); 67ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); 68ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage 2 709b35249446b07f40ac5fcc3205f2c048616efacchkuang input0 = _mm_packs_epi32(input0, input1); 719b35249446b07f40ac5fcc3205f2c048616efacchkuang input1 = _mm_packs_epi32(input2, input3); 
72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 73ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Transpose 749b35249446b07f40ac5fcc3205f2c048616efacchkuang input2 = _mm_unpacklo_epi16(input0, input1); 759b35249446b07f40ac5fcc3205f2c048616efacchkuang input3 = _mm_unpackhi_epi16(input0, input1); 769b35249446b07f40ac5fcc3205f2c048616efacchkuang input0 = _mm_unpacklo_epi32(input2, input3); 779b35249446b07f40ac5fcc3205f2c048616efacchkuang input1 = _mm_unpackhi_epi32(input2, input3); 78ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 79ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Switch column2, column 3, and then, we got: 80ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // input2: column1, column 0; input3: column2, column 3. 81ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_shuffle_epi32(input1, 0x4e); 82ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_add_epi16(input0, input1); 83ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_sub_epi16(input0, input1); 84ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 85ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Columns 86ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Construct i3, i1, i3, i1, i2, i0, i2, i0 879b35249446b07f40ac5fcc3205f2c048616efacchkuang input0 = _mm_unpacklo_epi32(input2, input2); 889b35249446b07f40ac5fcc3205f2c048616efacchkuang input1 = _mm_unpackhi_epi32(input2, input2); 899b35249446b07f40ac5fcc3205f2c048616efacchkuang input2 = _mm_unpackhi_epi32(input3, input3); 90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_unpacklo_epi32(input3, input3); 91ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 92ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage 1 93ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_madd_epi16(input0, cst); 94ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_madd_epi16(input1, cst); 95ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_madd_epi16(input2, cst); 
96ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_madd_epi16(input3, cst); 97ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 98ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_add_epi32(input0, rounding); 99ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_add_epi32(input1, rounding); 100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_add_epi32(input2, rounding); 101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_add_epi32(input3, rounding); 102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); 104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); 105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); 106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); 107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage 2 1099b35249446b07f40ac5fcc3205f2c048616efacchkuang input0 = _mm_packs_epi32(input0, input2); 1109b35249446b07f40ac5fcc3205f2c048616efacchkuang input1 = _mm_packs_epi32(input1, input3); 111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Transpose 1139b35249446b07f40ac5fcc3205f2c048616efacchkuang input2 = _mm_unpacklo_epi16(input0, input1); 1149b35249446b07f40ac5fcc3205f2c048616efacchkuang input3 = _mm_unpackhi_epi16(input0, input1); 1159b35249446b07f40ac5fcc3205f2c048616efacchkuang input0 = _mm_unpacklo_epi32(input2, input3); 1169b35249446b07f40ac5fcc3205f2c048616efacchkuang input1 = _mm_unpackhi_epi32(input2, input3); 117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Switch column2, column 3, and then, we got: 119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // input2: column1, column 0; input3: column2, column 3. 
120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_shuffle_epi32(input1, 0x4e); 121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_add_epi16(input0, input1); 122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_sub_epi16(input0, input1); 123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final round and shift 125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_add_epi16(input2, eight); 126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_add_epi16(input3, eight); 127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_srai_epi16(input2, 4); 129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_srai_epi16(input3, 4); 130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1319b35249446b07f40ac5fcc3205f2c048616efacchkuang // Reconstruction and Store 1329b35249446b07f40ac5fcc3205f2c048616efacchkuang { 1339b35249446b07f40ac5fcc3205f2c048616efacchkuang __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); 1349b35249446b07f40ac5fcc3205f2c048616efacchkuang __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); 1359b35249446b07f40ac5fcc3205f2c048616efacchkuang d0 = _mm_unpacklo_epi32(d0, 1369b35249446b07f40ac5fcc3205f2c048616efacchkuang _mm_cvtsi32_si128(*(const int *) (dest + stride))); 1379b35249446b07f40ac5fcc3205f2c048616efacchkuang d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128( 1389b35249446b07f40ac5fcc3205f2c048616efacchkuang *(const int *) (dest + stride * 3)), d2); 1399b35249446b07f40ac5fcc3205f2c048616efacchkuang d0 = _mm_unpacklo_epi8(d0, zero); 1409b35249446b07f40ac5fcc3205f2c048616efacchkuang d2 = _mm_unpacklo_epi8(d2, zero); 1419b35249446b07f40ac5fcc3205f2c048616efacchkuang d0 = _mm_add_epi16(d0, input2); 1429b35249446b07f40ac5fcc3205f2c048616efacchkuang d2 = _mm_add_epi16(d2, input3); 1439b35249446b07f40ac5fcc3205f2c048616efacchkuang d0 = _mm_packus_epi16(d0, d2); 
1449b35249446b07f40ac5fcc3205f2c048616efacchkuang // store input0 1459b35249446b07f40ac5fcc3205f2c048616efacchkuang *(int *)dest = _mm_cvtsi128_si32(d0); 1469b35249446b07f40ac5fcc3205f2c048616efacchkuang // store input1 1479b35249446b07f40ac5fcc3205f2c048616efacchkuang d0 = _mm_srli_si128(d0, 4); 1489b35249446b07f40ac5fcc3205f2c048616efacchkuang *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); 1499b35249446b07f40ac5fcc3205f2c048616efacchkuang // store input2 1509b35249446b07f40ac5fcc3205f2c048616efacchkuang d0 = _mm_srli_si128(d0, 4); 1519b35249446b07f40ac5fcc3205f2c048616efacchkuang *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); 1529b35249446b07f40ac5fcc3205f2c048616efacchkuang // store input3 1539b35249446b07f40ac5fcc3205f2c048616efacchkuang d0 = _mm_srli_si128(d0, 4); 1549b35249446b07f40ac5fcc3205f2c048616efacchkuang *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); 155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 15991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i dc_value; 16091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i zero = _mm_setzero_si128(); 16191037db265ecdd914a26e056cf69207b4f50924ehkuang int a; 16291037db265ecdd914a26e056cf69207b4f50924ehkuang 16391037db265ecdd914a26e056cf69207b4f50924ehkuang a = dct_const_round_shift(input[0] * cospi_16_64); 16491037db265ecdd914a26e056cf69207b4f50924ehkuang a = dct_const_round_shift(a * cospi_16_64); 16591037db265ecdd914a26e056cf69207b4f50924ehkuang a = ROUND_POWER_OF_TWO(a, 4); 16691037db265ecdd914a26e056cf69207b4f50924ehkuang 16791037db265ecdd914a26e056cf69207b4f50924ehkuang dc_value = _mm_set1_epi16(a); 16891037db265ecdd914a26e056cf69207b4f50924ehkuang 16991037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, dc_value); 
17091037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, dc_value); 17191037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, dc_value); 17291037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, dc_value); 17391037db265ecdd914a26e056cf69207b4f50924ehkuang} 17491037db265ecdd914a26e056cf69207b4f50924ehkuang 17591037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void transpose_4x4(__m128i *res) { 17691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 177b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); 17891037db265ecdd914a26e056cf69207b4f50924ehkuang 179b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); 180b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); 18191037db265ecdd914a26e056cf69207b4f50924ehkuang} 18291037db265ecdd914a26e056cf69207b4f50924ehkuang 183b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct4_sse2(__m128i *in) { 18491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); 18591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 18691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 18791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 18891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 18991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i u[8], v[8]; 19091037db265ecdd914a26e056cf69207b4f50924ehkuang 19191037db265ecdd914a26e056cf69207b4f50924ehkuang transpose_4x4(in); 
19291037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 1 193b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(in[0], in[1]); 194b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(in[0], in[1]); 19591037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 19691037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 19791037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 19891037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 19991037db265ecdd914a26e056cf69207b4f50924ehkuang 20091037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 20191037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 20291037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 20391037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 20491037db265ecdd914a26e056cf69207b4f50924ehkuang 20591037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 20691037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 20791037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 20891037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 20991037db265ecdd914a26e056cf69207b4f50924ehkuang 210b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_packs_epi32(v[0], v[1]); 211b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_packs_epi32(v[3], v[2]); 21291037db265ecdd914a26e056cf69207b4f50924ehkuang 21391037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 214b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
in[0] = _mm_add_epi16(u[0], u[1]); 215b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_sub_epi16(u[0], u[1]); 216b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_shuffle_epi32(in[1], 0x4E); 21791037db265ecdd914a26e056cf69207b4f50924ehkuang} 21891037db265ecdd914a26e056cf69207b4f50924ehkuang 219b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst4_sse2(__m128i *in) { 22091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); 22191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); 22291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); 22391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); 22491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); 22591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i kZero = _mm_set1_epi16(0); 22691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 22791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i u[8], v[8], in7; 22891037db265ecdd914a26e056cf69207b4f50924ehkuang 22991037db265ecdd914a26e056cf69207b4f50924ehkuang transpose_4x4(in); 230b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in7 = _mm_srli_si128(in[1], 8); 231b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in7 = _mm_add_epi16(in7, in[0]); 232b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in7 = _mm_sub_epi16(in7, in[1]); 23391037db265ecdd914a26e056cf69207b4f50924ehkuang 234b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(in[0], in[1]); 235b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian u[1] = _mm_unpackhi_epi16(in[0], in[1]); 23691037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(in7, kZero); 237b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(in[0], kZero); 23891037db265ecdd914a26e056cf69207b4f50924ehkuang 23991037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 24091037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 24191037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 24291037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 24391037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 24491037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 24591037db265ecdd914a26e056cf69207b4f50924ehkuang 24691037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[1]); 24791037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[3], v[4]); 24891037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = v[2]; 24991037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(u[0], u[1]); 25091037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_slli_epi32(v[5], 2); 25191037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(u[3], v[5]); 25291037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_sub_epi32(u[5], u[4]); 25391037db265ecdd914a26e056cf69207b4f50924ehkuang 25491037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 25591037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 25691037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 25791037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_add_epi32(u[6], 
k__DCT_CONST_ROUNDING); 25891037db265ecdd914a26e056cf69207b4f50924ehkuang 25991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 26091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 26191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 26291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 26391037db265ecdd914a26e056cf69207b4f50924ehkuang 264b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_packs_epi32(u[0], u[1]); 265b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_packs_epi32(u[2], u[3]); 26691037db265ecdd914a26e056cf69207b4f50924ehkuang} 26791037db265ecdd914a26e056cf69207b4f50924ehkuang 2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, 2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int tx_type) { 270b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in[2]; 27191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i zero = _mm_setzero_si128(); 27291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i eight = _mm_set1_epi16(8); 27391037db265ecdd914a26e056cf69207b4f50924ehkuang 274b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0]= _mm_loadu_si128((const __m128i *)(input)); 275b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1]= _mm_loadu_si128((const __m128i *)(input + 8)); 27691037db265ecdd914a26e056cf69207b4f50924ehkuang 27791037db265ecdd914a26e056cf69207b4f50924ehkuang switch (tx_type) { 27891037db265ecdd914a26e056cf69207b4f50924ehkuang case 0: // DCT_DCT 279b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct4_sse2(in); 280b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct4_sse2(in); 28191037db265ecdd914a26e056cf69207b4f50924ehkuang break; 
28291037db265ecdd914a26e056cf69207b4f50924ehkuang case 1: // ADST_DCT 283b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct4_sse2(in); 284b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst4_sse2(in); 28591037db265ecdd914a26e056cf69207b4f50924ehkuang break; 28691037db265ecdd914a26e056cf69207b4f50924ehkuang case 2: // DCT_ADST 287b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst4_sse2(in); 288b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct4_sse2(in); 28991037db265ecdd914a26e056cf69207b4f50924ehkuang break; 29091037db265ecdd914a26e056cf69207b4f50924ehkuang case 3: // ADST_ADST 291b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst4_sse2(in); 292b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst4_sse2(in); 29391037db265ecdd914a26e056cf69207b4f50924ehkuang break; 29491037db265ecdd914a26e056cf69207b4f50924ehkuang default: 29591037db265ecdd914a26e056cf69207b4f50924ehkuang assert(0); 29691037db265ecdd914a26e056cf69207b4f50924ehkuang break; 29791037db265ecdd914a26e056cf69207b4f50924ehkuang } 29891037db265ecdd914a26e056cf69207b4f50924ehkuang 29991037db265ecdd914a26e056cf69207b4f50924ehkuang // Final round and shift 30091037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_add_epi16(in[0], eight); 30191037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_add_epi16(in[1], eight); 30291037db265ecdd914a26e056cf69207b4f50924ehkuang 30391037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_srai_epi16(in[0], 4); 30491037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_srai_epi16(in[1], 4); 30591037db265ecdd914a26e056cf69207b4f50924ehkuang 306b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Reconstruction and Store 307b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 308b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); 
309b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); 310b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian d0 = _mm_unpacklo_epi32(d0, 311b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_cvtsi32_si128(*(const int *) (dest + stride))); 312b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128( 313b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian *(const int *) (dest + stride * 3))); 314b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian d0 = _mm_unpacklo_epi8(d0, zero); 315b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian d2 = _mm_unpacklo_epi8(d2, zero); 316b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian d0 = _mm_add_epi16(d0, in[0]); 317b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian d2 = _mm_add_epi16(d2, in[1]); 318b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian d0 = _mm_packus_epi16(d0, d2); 319b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // store result[0] 320b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian *(int *)dest = _mm_cvtsi128_si32(d0); 321b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // store result[1] 322b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian d0 = _mm_srli_si128(d0, 4); 323b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); 324b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // store result[2] 325b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian d0 = _mm_srli_si128(d0, 4); 326b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); 327b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // store result[3] 
328b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian d0 = _mm_srli_si128(d0, 4); 329b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); 330b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 33191037db265ecdd914a26e056cf69207b4f50924ehkuang} 33291037db265ecdd914a26e056cf69207b4f50924ehkuang 333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ 334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out0, out1, out2, out3, out4, out5, out6, out7) \ 335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ 339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ 340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ 341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ 342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ 343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ 344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ 346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ 347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ 348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ 
349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ 350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ 351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ 352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ 353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ 355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ 356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ 357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ 358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ 359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ 360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ 361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ 362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 364b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ 365b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out0, out1, out2, out3) \ 366b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { \ 367b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ 368b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ 369b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ 
370b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ 371b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ 373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ 374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ 375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ 376b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ 378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ 379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ 380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ 381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 383b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ 384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ 389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ 390b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 391b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
/*
 * Gathers the low four 16-bit lanes of four rows into two registers,
 * column by column:
 *   out0 = in0[0] in1[0] in2[0] in3[0] in0[1] in1[1] in2[1] in3[1]
 *   out1 = in0[2] in1[2] in2[2] in3[2] in0[3] in1[3] in2[3] in3[3]
 * Lanes 4..7 of every input are ignored.  Both intermediates are formed
 * before either output is assigned, so out0/out1 may name the same
 * variables as the inputs.
 *
 * Expands to a brace-enclosed block, not a do/while(0) statement.
 */
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  {                                                      \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);  \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);  \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);             \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);             \
  }

/*
 * Multiplies interleaved 16-bit pairs by constant pairs, sums each pair,
 * rounds, shifts and packs back to 16 bits.
 *
 * For every 32-bit lane holding the 16-bit pair (a, b) and a constant
 * register holding the pair (c0, c1), _mm_madd_epi16 yields a*c0 + b*c1.
 * `rounding` is then added, the sum is arithmetically shifted right by
 * DCT_CONST_BITS, and the lo/hi halves are packed into eight 16-bit lanes
 * with signed saturation:
 *   res0 <- (lo_0, hi_0) with cst0
 *   res1 <- (lo_0, hi_0) with cst1
 *   res2 <- (lo_1, hi_1) with cst2
 *   res3 <- (lo_1, hi_1) with cst3
 *
 * Names taken from the expansion site (they are NOT macro parameters):
 *   tmp0..tmp7     - assignable __m128i scratch variables, overwritten here
 *   rounding       - __m128i added to every 32-bit lane before the shift
 *   DCT_CONST_BITS - shift count
 *
 * All multiply-adds are issued before any result is written.
 * Expands to a brace-enclosed block, so it may be invoked without a
 * trailing semicolon.
 */
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, \
                               res0, res1, res2, res3) \
  {                                                    \
    tmp0 = _mm_madd_epi16(lo_0, cst0);                 \
    tmp1 = _mm_madd_epi16(hi_0, cst0);                 \
    tmp2 = _mm_madd_epi16(lo_0, cst1);                 \
    tmp3 = _mm_madd_epi16(hi_0, cst1);                 \
    tmp4 = _mm_madd_epi16(lo_1, cst2);                 \
    tmp5 = _mm_madd_epi16(hi_1, cst2);                 \
    tmp6 = _mm_madd_epi16(lo_1, cst3);                 \
    tmp7 = _mm_madd_epi16(hi_1, cst3);                 \
                                                       \
    tmp0 = _mm_add_epi32(tmp0, rounding);              \
    tmp1 = _mm_add_epi32(tmp1, rounding);              \
    tmp2 = _mm_add_epi32(tmp2, rounding);              \
    tmp3 = _mm_add_epi32(tmp3, rounding);              \
    tmp4 = _mm_add_epi32(tmp4, rounding);              \
    tmp5 = _mm_add_epi32(tmp5, rounding);              \
    tmp6 = _mm_add_epi32(tmp6, rounding);              \
    tmp7 = _mm_add_epi32(tmp7, rounding);              \
                                                       \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);       \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);       \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);       \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);       \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);       \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);       \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);       \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);       \
                                                       \
    res0 = _mm_packs_epi32(tmp0, tmp1);                \
    res1 = _mm_packs_epi32(tmp2, tmp3);                \
    res2 = _mm_packs_epi32(tmp4, tmp5);                \
    res3 = _mm_packs_epi32(tmp6, tmp7);                \
  }

437b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ 438b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { \ 439b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_0, cst0); \ 440b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_madd_epi16(hi_0, cst0); \ 441b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_0, cst1); \ 442b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_madd_epi16(hi_0, cst1); \ 443b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 444b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); \ 445b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_add_epi32(tmp1, rounding); \ 446b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); \ 447b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_add_epi32(tmp3, rounding); \ 448b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 449b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 450b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 451b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 452b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 453b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 454b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res0 = _mm_packs_epi32(tmp0, tmp1); \ 455b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res1 = _mm_packs_epi32(tmp2, tmp3); \ 
456b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 457b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 458b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ 459b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out0, out1, out2, out3, out4, out5, out6, out7) \ 460b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { \ 461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage1 */ \ 462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ 464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ 465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ 466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ 467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ 469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg1_1, stg1_2, stg1_3, stp1_4, \ 470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_7, stp1_5, stp1_6) \ 471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage2 */ \ 474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ 476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ 477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ 478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ 
479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ 481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg2_1, stg2_2, stg2_3, stp2_0, \ 482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_1, stp2_2, stp2_3) \ 483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ 485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ 486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ 487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ 488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage3 */ \ 491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ 496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ 497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ 498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ 499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ 501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ 502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ 503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = 
_mm_madd_epi16(hi_56, stg2_0); \ 504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); \ 506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_add_epi32(tmp1, rounding); \ 507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); \ 508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_add_epi32(tmp3, rounding); \ 509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage4 */ \ 520b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out0 = _mm_adds_epi16(stp1_0, stp2_7); \ 521b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out1 = _mm_adds_epi16(stp1_1, stp1_6); \ 522b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out2 = _mm_adds_epi16(stp1_2, stp1_5); \ 523b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out3 = _mm_adds_epi16(stp1_3, stp2_4); \ 524b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out4 = _mm_subs_epi16(stp1_3, stp2_4); \ 525b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out5 = _mm_subs_epi16(stp1_2, stp1_5); \ 
526b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out6 = _mm_subs_epi16(stp1_1, stp1_6); \ 527b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out7 = _mm_subs_epi16(stp1_0, stp2_7); \ 528b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define RECON_AND_STORE(dest, in_x) \ 531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ 533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang d0 = _mm_unpacklo_epi8(d0, zero); \ 534f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang d0 = _mm_add_epi16(in_x, d0); \ 535f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang d0 = _mm_packus_epi16(d0, d0); \ 536f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang _mm_storel_epi64((__m128i *)(dest), d0); \ 537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang dest += stride; \ 538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 5405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i final_rounding = _mm_set1_epi16(1<<4); 544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 
548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i in0, in1, in2, in3, in4, in5, in6, in7; 554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int i; 558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Load input data. 
5605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in0 = _mm_load_si128((const __m128i *)input); 5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); 5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); 5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); 5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); 5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); 5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); 5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); 568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 2-D 570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang for (i = 0; i < 2; i++) { 5715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // 8x8 Transpose is copied from vp9_fdct8x8_sse2() 572b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, 573b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in0, in1, in2, in3, in4, in5, in6, in7); 574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 4-stage 1D idct8x8 576b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, 577b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in0, in1, in2, in3, in4, in5, in6, in7); 578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final rounding and shift 581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_adds_epi16(in0, final_rounding); 
582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_adds_epi16(in1, final_rounding); 583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_adds_epi16(in2, final_rounding); 584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_adds_epi16(in3, final_rounding); 585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_adds_epi16(in4, final_rounding); 586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_adds_epi16(in5, final_rounding); 587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_adds_epi16(in6, final_rounding); 588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_adds_epi16(in7, final_rounding); 589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_srai_epi16(in0, 5); 591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_srai_epi16(in1, 5); 592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_srai_epi16(in2, 5); 593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_srai_epi16(in3, 5); 594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_srai_epi16(in4, 5); 595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_srai_epi16(in5, 5); 596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_srai_epi16(in6, 5); 597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_srai_epi16(in7, 5); 598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in0); 600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in1); 601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in2); 602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in3); 603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in4); 604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in5); 605ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in6); 606ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
RECON_AND_STORE(dest, in7); 607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 608ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 610f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang __m128i dc_value; 611f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang const __m128i zero = _mm_setzero_si128(); 612f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang int a; 613f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 614f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang a = dct_const_round_shift(input[0] * cospi_16_64); 615f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang a = dct_const_round_shift(a * cospi_16_64); 616f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang a = ROUND_POWER_OF_TWO(a, 5); 617f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 618f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang dc_value = _mm_set1_epi16(a); 619f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 620f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 621f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 622f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 623f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 624f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 625f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 626f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 627f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 628f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang} 629f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 63091037db265ecdd914a26e056cf69207b4f50924ehkuang// perform 8x8 transpose 63191037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { 63291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i 
tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); 63391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); 63491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); 63591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); 63691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); 63791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); 63891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); 63991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); 64091037db265ecdd914a26e056cf69207b4f50924ehkuang 64191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 64291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); 64391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 64491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); 64591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); 64691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 64791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); 64891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 64991037db265ecdd914a26e056cf69207b4f50924ehkuang 65091037db265ecdd914a26e056cf69207b4f50924ehkuang res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); 65191037db265ecdd914a26e056cf69207b4f50924ehkuang res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); 65291037db265ecdd914a26e056cf69207b4f50924ehkuang res[2] = 
_mm_unpacklo_epi64(tr1_2, tr1_3); 65391037db265ecdd914a26e056cf69207b4f50924ehkuang res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); 65491037db265ecdd914a26e056cf69207b4f50924ehkuang res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); 65591037db265ecdd914a26e056cf69207b4f50924ehkuang res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); 65691037db265ecdd914a26e056cf69207b4f50924ehkuang res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); 65791037db265ecdd914a26e056cf69207b4f50924ehkuang res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); 65891037db265ecdd914a26e056cf69207b4f50924ehkuang} 65991037db265ecdd914a26e056cf69207b4f50924ehkuang 660b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { 661b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); 662b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); 663b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); 664b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); 665b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 666b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 667b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 668b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 669b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 670b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 671b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); 
672b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); 673b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); 674b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); 675b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 676b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 677b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct8_sse2(__m128i *in) { 67891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 67991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 68091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 68191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 68291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 68391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 68491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 68591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 68691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 68791037db265ecdd914a26e056cf69207b4f50924ehkuang 68891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in0, in1, in2, in3, in4, in5, in6, in7; 68991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 69091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, 
stp2_6, stp2_7; 69191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 69291037db265ecdd914a26e056cf69207b4f50924ehkuang 6935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // 8x8 Transpose is copied from vp9_fdct8x8_sse2() 694b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], 695b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in0, in1, in2, in3, in4, in5, in6, in7); 69691037db265ecdd914a26e056cf69207b4f50924ehkuang 69791037db265ecdd914a26e056cf69207b4f50924ehkuang // 4-stage 1D idct8x8 698b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, 699b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); 70091037db265ecdd914a26e056cf69207b4f50924ehkuang} 70191037db265ecdd914a26e056cf69207b4f50924ehkuang 702b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst8_sse2(__m128i *in) { 70391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 70491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 70591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 70691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 70791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 70891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 70991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 
71091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 71191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 71291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 71391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 71491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 71591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 71691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__const_0 = _mm_set1_epi16(0); 71791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 71891037db265ecdd914a26e056cf69207b4f50924ehkuang 71991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; 72091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 72191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; 72291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s0, s1, s2, s3, s4, s5, s6, s7; 72391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in0, in1, in2, in3, in4, in5, in6, in7; 72491037db265ecdd914a26e056cf69207b4f50924ehkuang 72591037db265ecdd914a26e056cf69207b4f50924ehkuang // transpose 72691037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(in, in); 72791037db265ecdd914a26e056cf69207b4f50924ehkuang 72891037db265ecdd914a26e056cf69207b4f50924ehkuang // properly aligned for butterfly input 72991037db265ecdd914a26e056cf69207b4f50924ehkuang in0 = in[7]; 
73091037db265ecdd914a26e056cf69207b4f50924ehkuang in1 = in[0]; 73191037db265ecdd914a26e056cf69207b4f50924ehkuang in2 = in[5]; 73291037db265ecdd914a26e056cf69207b4f50924ehkuang in3 = in[2]; 73391037db265ecdd914a26e056cf69207b4f50924ehkuang in4 = in[3]; 73491037db265ecdd914a26e056cf69207b4f50924ehkuang in5 = in[4]; 73591037db265ecdd914a26e056cf69207b4f50924ehkuang in6 = in[1]; 73691037db265ecdd914a26e056cf69207b4f50924ehkuang in7 = in[6]; 73791037db265ecdd914a26e056cf69207b4f50924ehkuang 73891037db265ecdd914a26e056cf69207b4f50924ehkuang // column transformation 73991037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 1 74091037db265ecdd914a26e056cf69207b4f50924ehkuang // interleave and multiply/add into 32-bit integer 74191037db265ecdd914a26e056cf69207b4f50924ehkuang s0 = _mm_unpacklo_epi16(in0, in1); 74291037db265ecdd914a26e056cf69207b4f50924ehkuang s1 = _mm_unpackhi_epi16(in0, in1); 74391037db265ecdd914a26e056cf69207b4f50924ehkuang s2 = _mm_unpacklo_epi16(in2, in3); 74491037db265ecdd914a26e056cf69207b4f50924ehkuang s3 = _mm_unpackhi_epi16(in2, in3); 74591037db265ecdd914a26e056cf69207b4f50924ehkuang s4 = _mm_unpacklo_epi16(in4, in5); 74691037db265ecdd914a26e056cf69207b4f50924ehkuang s5 = _mm_unpackhi_epi16(in4, in5); 74791037db265ecdd914a26e056cf69207b4f50924ehkuang s6 = _mm_unpacklo_epi16(in6, in7); 74891037db265ecdd914a26e056cf69207b4f50924ehkuang s7 = _mm_unpackhi_epi16(in6, in7); 74991037db265ecdd914a26e056cf69207b4f50924ehkuang 75091037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); 75191037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); 75291037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); 75391037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); 75491037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); 75591037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_madd_epi16(s3, 
k__cospi_p10_p22); 75691037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); 75791037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); 75891037db265ecdd914a26e056cf69207b4f50924ehkuang u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); 75991037db265ecdd914a26e056cf69207b4f50924ehkuang u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); 76091037db265ecdd914a26e056cf69207b4f50924ehkuang u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); 76191037db265ecdd914a26e056cf69207b4f50924ehkuang u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); 76291037db265ecdd914a26e056cf69207b4f50924ehkuang u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); 76391037db265ecdd914a26e056cf69207b4f50924ehkuang u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); 76491037db265ecdd914a26e056cf69207b4f50924ehkuang u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); 76591037db265ecdd914a26e056cf69207b4f50924ehkuang u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); 76691037db265ecdd914a26e056cf69207b4f50924ehkuang 76791037db265ecdd914a26e056cf69207b4f50924ehkuang // addition 76891037db265ecdd914a26e056cf69207b4f50924ehkuang w0 = _mm_add_epi32(u0, u8); 76991037db265ecdd914a26e056cf69207b4f50924ehkuang w1 = _mm_add_epi32(u1, u9); 77091037db265ecdd914a26e056cf69207b4f50924ehkuang w2 = _mm_add_epi32(u2, u10); 77191037db265ecdd914a26e056cf69207b4f50924ehkuang w3 = _mm_add_epi32(u3, u11); 77291037db265ecdd914a26e056cf69207b4f50924ehkuang w4 = _mm_add_epi32(u4, u12); 77391037db265ecdd914a26e056cf69207b4f50924ehkuang w5 = _mm_add_epi32(u5, u13); 77491037db265ecdd914a26e056cf69207b4f50924ehkuang w6 = _mm_add_epi32(u6, u14); 77591037db265ecdd914a26e056cf69207b4f50924ehkuang w7 = _mm_add_epi32(u7, u15); 77691037db265ecdd914a26e056cf69207b4f50924ehkuang w8 = _mm_sub_epi32(u0, u8); 77791037db265ecdd914a26e056cf69207b4f50924ehkuang w9 = _mm_sub_epi32(u1, u9); 77891037db265ecdd914a26e056cf69207b4f50924ehkuang w10 = _mm_sub_epi32(u2, u10); 77991037db265ecdd914a26e056cf69207b4f50924ehkuang w11 = 
_mm_sub_epi32(u3, u11); 78091037db265ecdd914a26e056cf69207b4f50924ehkuang w12 = _mm_sub_epi32(u4, u12); 78191037db265ecdd914a26e056cf69207b4f50924ehkuang w13 = _mm_sub_epi32(u5, u13); 78291037db265ecdd914a26e056cf69207b4f50924ehkuang w14 = _mm_sub_epi32(u6, u14); 78391037db265ecdd914a26e056cf69207b4f50924ehkuang w15 = _mm_sub_epi32(u7, u15); 78491037db265ecdd914a26e056cf69207b4f50924ehkuang 78591037db265ecdd914a26e056cf69207b4f50924ehkuang // shift and rounding 78691037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 78791037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 78891037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 78991037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 79091037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 79191037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 79291037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 79391037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 79491037db265ecdd914a26e056cf69207b4f50924ehkuang v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); 79591037db265ecdd914a26e056cf69207b4f50924ehkuang v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); 79691037db265ecdd914a26e056cf69207b4f50924ehkuang v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); 79791037db265ecdd914a26e056cf69207b4f50924ehkuang v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); 79891037db265ecdd914a26e056cf69207b4f50924ehkuang v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); 79991037db265ecdd914a26e056cf69207b4f50924ehkuang v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); 80091037db265ecdd914a26e056cf69207b4f50924ehkuang v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); 80191037db265ecdd914a26e056cf69207b4f50924ehkuang v15 = 
_mm_add_epi32(w15, k__DCT_CONST_ROUNDING); 80291037db265ecdd914a26e056cf69207b4f50924ehkuang 80391037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 80491037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 80591037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 80691037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 80791037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 80891037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 80991037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 81091037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 81191037db265ecdd914a26e056cf69207b4f50924ehkuang u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); 81291037db265ecdd914a26e056cf69207b4f50924ehkuang u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); 81391037db265ecdd914a26e056cf69207b4f50924ehkuang u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); 81491037db265ecdd914a26e056cf69207b4f50924ehkuang u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); 81591037db265ecdd914a26e056cf69207b4f50924ehkuang u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); 81691037db265ecdd914a26e056cf69207b4f50924ehkuang u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); 81791037db265ecdd914a26e056cf69207b4f50924ehkuang u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); 81891037db265ecdd914a26e056cf69207b4f50924ehkuang u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); 81991037db265ecdd914a26e056cf69207b4f50924ehkuang 82091037db265ecdd914a26e056cf69207b4f50924ehkuang // back to 16-bit and pack 8 integers into __m128i 82191037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_packs_epi32(u0, u1); 82291037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_packs_epi32(u2, u3); 82391037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_packs_epi32(u4, u5); 
82491037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_packs_epi32(u6, u7); 82591037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_packs_epi32(u8, u9); 82691037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_packs_epi32(u10, u11); 82791037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_packs_epi32(u12, u13); 82891037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_packs_epi32(u14, u15); 82991037db265ecdd914a26e056cf69207b4f50924ehkuang 83091037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 83191037db265ecdd914a26e056cf69207b4f50924ehkuang s0 = _mm_add_epi16(in[0], in[2]); 83291037db265ecdd914a26e056cf69207b4f50924ehkuang s1 = _mm_add_epi16(in[1], in[3]); 83391037db265ecdd914a26e056cf69207b4f50924ehkuang s2 = _mm_sub_epi16(in[0], in[2]); 83491037db265ecdd914a26e056cf69207b4f50924ehkuang s3 = _mm_sub_epi16(in[1], in[3]); 83591037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_unpacklo_epi16(in[4], in[5]); 83691037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_unpackhi_epi16(in[4], in[5]); 83791037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_unpacklo_epi16(in[6], in[7]); 83891037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_unpackhi_epi16(in[6], in[7]); 83991037db265ecdd914a26e056cf69207b4f50924ehkuang 84091037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); 84191037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); 84291037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); 84391037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); 84491037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); 84591037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); 84691037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); 84791037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_madd_epi16(u3, 
k__cospi_p08_p24); 84891037db265ecdd914a26e056cf69207b4f50924ehkuang 84991037db265ecdd914a26e056cf69207b4f50924ehkuang w0 = _mm_add_epi32(v0, v4); 85091037db265ecdd914a26e056cf69207b4f50924ehkuang w1 = _mm_add_epi32(v1, v5); 85191037db265ecdd914a26e056cf69207b4f50924ehkuang w2 = _mm_add_epi32(v2, v6); 85291037db265ecdd914a26e056cf69207b4f50924ehkuang w3 = _mm_add_epi32(v3, v7); 85391037db265ecdd914a26e056cf69207b4f50924ehkuang w4 = _mm_sub_epi32(v0, v4); 85491037db265ecdd914a26e056cf69207b4f50924ehkuang w5 = _mm_sub_epi32(v1, v5); 85591037db265ecdd914a26e056cf69207b4f50924ehkuang w6 = _mm_sub_epi32(v2, v6); 85691037db265ecdd914a26e056cf69207b4f50924ehkuang w7 = _mm_sub_epi32(v3, v7); 85791037db265ecdd914a26e056cf69207b4f50924ehkuang 85891037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 85991037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 86091037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 86191037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 86291037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 86391037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 86491037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 86591037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 86691037db265ecdd914a26e056cf69207b4f50924ehkuang 86791037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 86891037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 86991037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 87091037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 87191037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = 
_mm_srai_epi32(v4, DCT_CONST_BITS); 87291037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 87391037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 87491037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 87591037db265ecdd914a26e056cf69207b4f50924ehkuang 87691037db265ecdd914a26e056cf69207b4f50924ehkuang // back to 16-bit intergers 87791037db265ecdd914a26e056cf69207b4f50924ehkuang s4 = _mm_packs_epi32(u0, u1); 87891037db265ecdd914a26e056cf69207b4f50924ehkuang s5 = _mm_packs_epi32(u2, u3); 87991037db265ecdd914a26e056cf69207b4f50924ehkuang s6 = _mm_packs_epi32(u4, u5); 88091037db265ecdd914a26e056cf69207b4f50924ehkuang s7 = _mm_packs_epi32(u6, u7); 88191037db265ecdd914a26e056cf69207b4f50924ehkuang 88291037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 3 88391037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_unpacklo_epi16(s2, s3); 88491037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_unpackhi_epi16(s2, s3); 88591037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_unpacklo_epi16(s6, s7); 88691037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_unpackhi_epi16(s6, s7); 88791037db265ecdd914a26e056cf69207b4f50924ehkuang 88891037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); 88991037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); 89091037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); 89191037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); 89291037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); 89391037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); 89491037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); 89591037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); 
89691037db265ecdd914a26e056cf69207b4f50924ehkuang 89791037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 89891037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 89991037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 90091037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 90191037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 90291037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 90391037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 90491037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 90591037db265ecdd914a26e056cf69207b4f50924ehkuang 90691037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 90791037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 90891037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 90991037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 91091037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 91191037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 91291037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 91391037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 91491037db265ecdd914a26e056cf69207b4f50924ehkuang 91591037db265ecdd914a26e056cf69207b4f50924ehkuang s2 = _mm_packs_epi32(v0, v1); 91691037db265ecdd914a26e056cf69207b4f50924ehkuang s3 = _mm_packs_epi32(v2, v3); 91791037db265ecdd914a26e056cf69207b4f50924ehkuang s6 = _mm_packs_epi32(v4, v5); 91891037db265ecdd914a26e056cf69207b4f50924ehkuang s7 = _mm_packs_epi32(v6, v7); 
91991037db265ecdd914a26e056cf69207b4f50924ehkuang 92091037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = s0; 92191037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_sub_epi16(k__const_0, s4); 92291037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = s6; 92391037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_sub_epi16(k__const_0, s2); 92491037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = s3; 92591037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_sub_epi16(k__const_0, s7); 92691037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = s5; 92791037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_sub_epi16(k__const_0, s1); 92891037db265ecdd914a26e056cf69207b4f50924ehkuang} 92991037db265ecdd914a26e056cf69207b4f50924ehkuang 93091037db265ecdd914a26e056cf69207b4f50924ehkuang 9315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, 9325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int tx_type) { 93391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in[8]; 93491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i zero = _mm_setzero_si128(); 93591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i final_rounding = _mm_set1_epi16(1<<4); 93691037db265ecdd914a26e056cf69207b4f50924ehkuang 93791037db265ecdd914a26e056cf69207b4f50924ehkuang // load input data 9385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[0] = _mm_load_si128((const __m128i *)input); 9395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); 9405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); 9415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); 9425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); 9435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[5] = _mm_load_si128((const __m128i 
*)(input + 8 * 5)); 9445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); 9455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); 94691037db265ecdd914a26e056cf69207b4f50924ehkuang 94791037db265ecdd914a26e056cf69207b4f50924ehkuang switch (tx_type) { 94891037db265ecdd914a26e056cf69207b4f50924ehkuang case 0: // DCT_DCT 949b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct8_sse2(in); 950b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct8_sse2(in); 95191037db265ecdd914a26e056cf69207b4f50924ehkuang break; 95291037db265ecdd914a26e056cf69207b4f50924ehkuang case 1: // ADST_DCT 953b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct8_sse2(in); 954b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst8_sse2(in); 95591037db265ecdd914a26e056cf69207b4f50924ehkuang break; 95691037db265ecdd914a26e056cf69207b4f50924ehkuang case 2: // DCT_ADST 957b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst8_sse2(in); 958b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct8_sse2(in); 95991037db265ecdd914a26e056cf69207b4f50924ehkuang break; 96091037db265ecdd914a26e056cf69207b4f50924ehkuang case 3: // ADST_ADST 961b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst8_sse2(in); 962b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst8_sse2(in); 96391037db265ecdd914a26e056cf69207b4f50924ehkuang break; 96491037db265ecdd914a26e056cf69207b4f50924ehkuang default: 96591037db265ecdd914a26e056cf69207b4f50924ehkuang assert(0); 96691037db265ecdd914a26e056cf69207b4f50924ehkuang break; 96791037db265ecdd914a26e056cf69207b4f50924ehkuang } 96891037db265ecdd914a26e056cf69207b4f50924ehkuang 96991037db265ecdd914a26e056cf69207b4f50924ehkuang // Final rounding and shift 97091037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_adds_epi16(in[0], 
final_rounding); 97191037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_adds_epi16(in[1], final_rounding); 97291037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_adds_epi16(in[2], final_rounding); 97391037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_adds_epi16(in[3], final_rounding); 97491037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_adds_epi16(in[4], final_rounding); 97591037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_adds_epi16(in[5], final_rounding); 97691037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_adds_epi16(in[6], final_rounding); 97791037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_adds_epi16(in[7], final_rounding); 97891037db265ecdd914a26e056cf69207b4f50924ehkuang 97991037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_srai_epi16(in[0], 5); 98091037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_srai_epi16(in[1], 5); 98191037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_srai_epi16(in[2], 5); 98291037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_srai_epi16(in[3], 5); 98391037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_srai_epi16(in[4], 5); 98491037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_srai_epi16(in[5], 5); 98591037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_srai_epi16(in[6], 5); 98691037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_srai_epi16(in[7], 5); 98791037db265ecdd914a26e056cf69207b4f50924ehkuang 98891037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[0]); 98991037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[1]); 99091037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[2]); 99191037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[3]); 99291037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[4]); 99391037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[5]); 99491037db265ecdd914a26e056cf69207b4f50924ehkuang 
RECON_AND_STORE(dest, in[6]); 99591037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[7]); 99691037db265ecdd914a26e056cf69207b4f50924ehkuang} 99791037db265ecdd914a26e056cf69207b4f50924ehkuang 9985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 999ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 1000ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 1001ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i final_rounding = _mm_set1_epi16(1<<4); 1002ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1003ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 1004ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 1005ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 1006ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 1007ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1009ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 1010ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1011ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1012ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i in0, in1, in2, in3, in4, in5, in6, in7; 1013ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 
1014ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 1015ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1016ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1017ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Rows. Load 4-row input data. 10185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in0 = _mm_load_si128((const __m128i *)input); 10195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); 10205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); 10215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); 1022ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1023ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 8x4 Transpose 1024b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); 1025ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage1 10265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang { //NOLINT 1027b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); 1028b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); 1029ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1030ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_17, stg1_0); 1031ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_17, stg1_1); 1032ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_madd_epi16(lo_35, stg1_2); 1033ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_madd_epi16(lo_35, stg1_3); 1034ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1035ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 1036ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = 
_mm_add_epi32(tmp2, rounding); 1037ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_add_epi32(tmp4, rounding); 1038ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_add_epi32(tmp6, rounding); 1039ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1040ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 1041ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 1042ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 1043ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1044b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_4 = _mm_packs_epi32(tmp0, tmp2); 1045b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_5 = _mm_packs_epi32(tmp4, tmp6); 1046ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1047ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1048ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage2 10495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang { //NOLINT 1050b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); 1051b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); 1052ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1053ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_04, stg2_0); 1054ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_04, stg2_1); 1055ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_madd_epi16(lo_26, stg2_2); 1056ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_madd_epi16(lo_26, stg2_3); 1057ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1058ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 1059ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 
1060ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_add_epi32(tmp4, rounding); 1061ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_add_epi32(tmp6, rounding); 1062ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1063ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 1064ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 1065ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 1066ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1067b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_0 = _mm_packs_epi32(tmp0, tmp2); 1068b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_2 = _mm_packs_epi32(tmp6, tmp4); 1069ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1070b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_adds_epi16(stp1_4, stp1_5); 1071b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_subs_epi16(stp1_4, stp1_5); 1072b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1073b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_4 = tmp0; 1074b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_5 = _mm_unpacklo_epi64(tmp1, zero); 1075b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_6 = _mm_unpackhi_epi64(tmp1, zero); 1076ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1077ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1078ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage3 10795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang { //NOLINT 1080ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); 1081b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1082b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp4 = _mm_adds_epi16(stp2_0, stp2_2); 
1083b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp6 = _mm_subs_epi16(stp2_0, stp2_2); 1084b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1085b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); 1086b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); 1087ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1088ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_56, stg3_0); 1089ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 1090ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1091ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 1092ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 1093ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1094ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 1095ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1096b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_5 = _mm_packs_epi32(tmp0, tmp2); 1097ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1098ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1099ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage4 1100b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_adds_epi16(stp1_3, stp2_4); 1101b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_adds_epi16(stp1_2, stp1_5); 1102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_subs_epi16(stp1_3, stp2_4); 1103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_subs_epi16(stp1_2, stp1_5); 1104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1105b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, 
tmp3, in0, in1, in2, in3) 1106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1107b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, 1108b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in0, in1, in2, in3, in4, in5, in6, in7); 1109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final rounding and shift 1110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_adds_epi16(in0, final_rounding); 1111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_adds_epi16(in1, final_rounding); 1112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_adds_epi16(in2, final_rounding); 1113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_adds_epi16(in3, final_rounding); 1114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_adds_epi16(in4, final_rounding); 1115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_adds_epi16(in5, final_rounding); 1116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_adds_epi16(in6, final_rounding); 1117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_adds_epi16(in7, final_rounding); 1118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_srai_epi16(in0, 5); 1120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_srai_epi16(in1, 5); 1121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_srai_epi16(in2, 5); 1122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_srai_epi16(in3, 5); 1123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_srai_epi16(in4, 5); 1124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_srai_epi16(in5, 5); 1125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_srai_epi16(in6, 5); 1126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_srai_epi16(in7, 5); 1127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in0); 
1129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in1); 1130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in2); 1131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in3); 1132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in4); 1133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in5); 1134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in6); 1135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in7); 1136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 1137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1138b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define IDCT16 \ 1139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage2 */ \ 1140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 1141b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ 1142b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ 1143b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ 1144b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ 1145b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ 1146b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ 1147b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ 1148b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ 1149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 
1150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ 1151ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg2_0, stg2_1, stg2_2, stg2_3, \ 1152ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_8, stp2_15, stp2_9, stp2_14) \ 1153ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1154ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ 1155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg2_4, stg2_5, stg2_6, stg2_7, \ 1156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_10, stp2_13, stp2_11, stp2_12) \ 1157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 1158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1159ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage3 */ \ 1160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 1161b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ 1162b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ 1163b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ 1164b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ 1165ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1166ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ 1167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg3_0, stg3_1, stg3_2, stg3_3, \ 1168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_4, stp1_7, stp1_5, stp1_6) \ 1169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ 1171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ 
1172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ 1173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ 1174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ 1176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ 1177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ 1178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ 1179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 1180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage4 */ \ 1182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 1183b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ 1184b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ 1185b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ 1186b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ 1187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 1189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 1190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 
1193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ 1194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg4_0, stg4_1, stg4_2, stg4_3, \ 1195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_0, stp2_1, stp2_2, stp2_3) \ 1196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ 1198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ 1199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ 1200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ 1201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ 1203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg4_4, stg4_5, stg4_6, stg4_7, \ 1204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_9, stp2_14, stp2_10, stp2_13) \ 1205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 1206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage5 */ \ 1208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 1209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 1210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 1211ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ 1213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ 1214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ 1215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ 1216ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 
1217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 1218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 1219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 1220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 1221ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); \ 1223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_add_epi32(tmp1, rounding); \ 1224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); \ 1225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_add_epi32(tmp3, rounding); \ 1226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1227ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 1228ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 1229ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 1230ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 1231ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1232ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 1233ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 1234ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1235ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ 1236ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 1237ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 1238ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ 1239ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 
1240ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ 1241ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 1242ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 1243ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ 1244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 1245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage6 */ \ 1247ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 1248ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1249ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1250ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 1251ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 1252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ 1254ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 1255ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 1256ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ 1257ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ 1258ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 1259ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 1260ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ 1261ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1262ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 1263ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg6_0, stg4_0, stg6_0, stg4_0, \ 1264ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_10, stp2_13, stp2_11, stp2_12) \ 1265ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1267b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define IDCT16_10 \ 1268b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian /* Stage2 */ \ 1269b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { \ 1270b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ 1271b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ 1272b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ 1273b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ 1274b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1275b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ 1276b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg2_0, stg2_1, stg2_6, stg2_7, \ 1277b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ 1278b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } \ 1279b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1280b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian /* Stage3 */ \ 1281b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { \ 1282b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ 
1283b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ 1284b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1285b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ 1286b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg3_0, stg3_1, \ 1287b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_4, stp2_7) \ 1288b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1289b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_9 = stp1_8_0; \ 1290b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_10 = stp1_11; \ 1291b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1292b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_13 = stp1_12_0; \ 1293b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_14 = stp1_15; \ 1294b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } \ 1295b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1296b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian /* Stage4 */ \ 1297b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { \ 1298b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ 1299b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ 1300b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1301b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 1302b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 1303b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const 
__m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1304b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1305b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1306b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ 1307b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg4_0, stg4_1, \ 1308b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_0, stp1_1) \ 1309b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_5 = stp2_4; \ 1310b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_6 = stp2_7; \ 1311b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1312b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ 1313b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg4_4, stg4_5, stg4_6, stg4_7, \ 1314b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_9, stp2_14, stp2_10, stp2_13) \ 1315b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } \ 1316b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1317b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian /* Stage5 */ \ 1318b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { \ 1319b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 1320b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 1321b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1322b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_2 = stp1_1; \ 1323b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_3 = stp1_0; \ 
1324b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1325b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 1326b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 1327b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 1328b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 1329b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1330b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); \ 1331b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_add_epi32(tmp1, rounding); \ 1332b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); \ 1333b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_add_epi32(tmp3, rounding); \ 1334b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1335b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 1336b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 1337b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 1338b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 1339b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1340b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 1341b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 1342b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 
1343b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ 1344b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 1345b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 1346b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ 1347b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1348b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ 1349b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 1350b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 1351b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ 1352b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } \ 1353b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1354b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian /* Stage6 */ \ 1355b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { \ 1356b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1357b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1358b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 1359b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 1360b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1361b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ 1362b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 1363b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 1364b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ 1365b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ 1366b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 1367b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 1368b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ 1369b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 1370b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 1371b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg6_0, stg4_0, stg6_0, stg4_0, \ 1372b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_10, stp2_13, stp2_11, stp2_12) \ 1373b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1374b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 13755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, 13765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int stride) { 1377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 1378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i final_rounding = _mm_set1_epi16(1<<5); 1379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 1380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 1383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 1385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 1387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 1388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 1389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 1392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 1394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 1396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 1399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_4 = 
pair_set_epi16(-cospi_8_64, cospi_24_64); 1400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 1401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1406b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in[16], l[16], r[16], *curr1; 1407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 1408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 1409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8_0, stp1_12_0; 1410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 1411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; 1412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int i; 1414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1415b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1 = l; 1416b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (i = 0; i < 2; i++) { 1417b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 1-D idct 1418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Load input data. 
1420b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_load_si128((const __m128i *)input); 1421b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); 1422b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); 1423b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); 1424b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); 1425b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); 1426b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); 1427b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); 1428b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); 1429b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); 1430b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); 1431b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); 1432b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); 1433b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); 1434b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); 1435b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); 1436b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1437b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(in, in); 1438b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(in+8, in+8); 1439b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1440b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian IDCT16 1441b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1442b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Stage7 1443b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[0] = _mm_add_epi16(stp2_0, stp1_15); 1444b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[1] = _mm_add_epi16(stp2_1, stp1_14); 1445b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[2] = _mm_add_epi16(stp2_2, stp2_13); 1446b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[3] = _mm_add_epi16(stp2_3, stp2_12); 1447b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[4] = _mm_add_epi16(stp2_4, stp2_11); 1448b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[5] = _mm_add_epi16(stp2_5, stp2_10); 1449b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[6] = _mm_add_epi16(stp2_6, stp1_9); 1450b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[7] = _mm_add_epi16(stp2_7, stp1_8); 1451b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); 1452b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); 1453b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); 1454b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[11] = 
_mm_sub_epi16(stp2_4, stp2_11); 1455b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); 1456b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); 1457b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); 1458b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); 1459b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1460b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian curr1 = r; 1461b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian input += 128; 1462b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1463b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (i = 0; i < 2; i++) { 1464b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 1-D idct 1465b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(l+i*8, in); 1466b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(r+i*8, in+8); 1467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1468b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian IDCT16 1469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 2-D 1471b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_add_epi16(stp2_0, stp1_15); 1472b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_add_epi16(stp2_1, stp1_14); 1473b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_add_epi16(stp2_2, stp2_13); 1474b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_add_epi16(stp2_3, stp2_12); 1475b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_add_epi16(stp2_4, stp2_11); 
1476b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_add_epi16(stp2_5, stp2_10); 1477b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_add_epi16(stp2_6, stp1_9); 1478b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_add_epi16(stp2_7, stp1_8); 1479b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_sub_epi16(stp2_7, stp1_8); 1480b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_sub_epi16(stp2_6, stp1_9); 1481b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_sub_epi16(stp2_5, stp2_10); 1482b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_sub_epi16(stp2_4, stp2_11); 1483b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_sub_epi16(stp2_3, stp2_12); 1484b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_sub_epi16(stp2_2, stp2_13); 1485b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_sub_epi16(stp2_1, stp1_14); 1486b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_sub_epi16(stp2_0, stp1_15); 1487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final rounding and shift 1489b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_adds_epi16(in[0], final_rounding); 1490b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_adds_epi16(in[1], final_rounding); 1491b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_adds_epi16(in[2], final_rounding); 1492b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_adds_epi16(in[3], final_rounding); 1493b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_adds_epi16(in[4], final_rounding); 1494b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian in[5] = _mm_adds_epi16(in[5], final_rounding); 1495b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_adds_epi16(in[6], final_rounding); 1496b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_adds_epi16(in[7], final_rounding); 1497b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_adds_epi16(in[8], final_rounding); 1498b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_adds_epi16(in[9], final_rounding); 1499b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_adds_epi16(in[10], final_rounding); 1500b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_adds_epi16(in[11], final_rounding); 1501b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_adds_epi16(in[12], final_rounding); 1502b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_adds_epi16(in[13], final_rounding); 1503b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_adds_epi16(in[14], final_rounding); 1504b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_adds_epi16(in[15], final_rounding); 1505b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1506b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_srai_epi16(in[0], 6); 1507b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_srai_epi16(in[1], 6); 1508b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_srai_epi16(in[2], 6); 1509b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_srai_epi16(in[3], 6); 1510b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_srai_epi16(in[4], 6); 1511b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_srai_epi16(in[5], 6); 
1512b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_srai_epi16(in[6], 6); 1513b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_srai_epi16(in[7], 6); 1514b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_srai_epi16(in[8], 6); 1515b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_srai_epi16(in[9], 6); 1516b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_srai_epi16(in[10], 6); 1517b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_srai_epi16(in[11], 6); 1518b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_srai_epi16(in[12], 6); 1519b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_srai_epi16(in[13], 6); 1520b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_srai_epi16(in[14], 6); 1521b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_srai_epi16(in[15], 6); 1522b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1523b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[0]); 1524b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[1]); 1525b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[2]); 1526b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[3]); 1527b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[4]); 1528b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[5]); 1529b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[6]); 1530b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[7]); 1531b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian RECON_AND_STORE(dest, in[8]); 1532b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[9]); 1533b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[10]); 1534b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[11]); 1535b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[12]); 1536b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[13]); 1537b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[14]); 1538b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[15]); 1539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang dest += 8 - (stride * 16); 1541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 1543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 15445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 1545f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang __m128i dc_value; 1546f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang const __m128i zero = _mm_setzero_si128(); 1547f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang int a, i; 1548f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 1549f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang a = dct_const_round_shift(input[0] * cospi_16_64); 1550f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang a = dct_const_round_shift(a * cospi_16_64); 1551f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang a = ROUND_POWER_OF_TWO(a, 6); 1552f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 1553f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang dc_value = _mm_set1_epi16(a); 1554f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 1555f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang for (i = 0; i < 2; ++i) { 
1556f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1557f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1558f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1559f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1560f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1561f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1562f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1563f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1564f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1565f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1566f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1567f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1568f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1569f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1570f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1571f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang RECON_AND_STORE(dest, dc_value); 1572f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang dest += 8 - (stride * 16); 1573f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang } 1574f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang} 1575f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 157691037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { 157791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tbuf[8]; 157891037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(res0, res0); 157991037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(res1, tbuf); 158091037db265ecdd914a26e056cf69207b4f50924ehkuang 
array_transpose_8x8(res0 + 8, res1); 158191037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(res1 + 8, res1 + 8); 158291037db265ecdd914a26e056cf69207b4f50924ehkuang 158391037db265ecdd914a26e056cf69207b4f50924ehkuang res0[8] = tbuf[0]; 158491037db265ecdd914a26e056cf69207b4f50924ehkuang res0[9] = tbuf[1]; 158591037db265ecdd914a26e056cf69207b4f50924ehkuang res0[10] = tbuf[2]; 158691037db265ecdd914a26e056cf69207b4f50924ehkuang res0[11] = tbuf[3]; 158791037db265ecdd914a26e056cf69207b4f50924ehkuang res0[12] = tbuf[4]; 158891037db265ecdd914a26e056cf69207b4f50924ehkuang res0[13] = tbuf[5]; 158991037db265ecdd914a26e056cf69207b4f50924ehkuang res0[14] = tbuf[6]; 159091037db265ecdd914a26e056cf69207b4f50924ehkuang res0[15] = tbuf[7]; 159191037db265ecdd914a26e056cf69207b4f50924ehkuang} 159291037db265ecdd914a26e056cf69207b4f50924ehkuang 1593b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst16_8col(__m128i *in) { 159491037db265ecdd914a26e056cf69207b4f50924ehkuang // perform 16x16 1-D ADST for 8 columns 159591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s[16], x[16], u[32], v[32]; 159691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 159791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 159891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 159991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 160091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); 160191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); 160291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p13_p19 = 
pair_set_epi16(cospi_13_64, cospi_19_64); 160391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); 160491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); 160591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); 160691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); 160791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); 160891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); 160991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); 161091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); 161191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); 161291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 161391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 161491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 161591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 161691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); 161791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); 161891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i 
k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 161991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 162091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 162191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); 162291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 162391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 162491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 162591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 162691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i kZero = _mm_set1_epi16(0); 162791037db265ecdd914a26e056cf69207b4f50924ehkuang 162891037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(in[15], in[0]); 162991037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(in[15], in[0]); 163091037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(in[13], in[2]); 163191037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(in[13], in[2]); 163291037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(in[11], in[4]); 163391037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(in[11], in[4]); 163491037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(in[9], in[6]); 163591037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(in[9], in[6]); 163691037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_unpacklo_epi16(in[7], in[8]); 163791037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_unpackhi_epi16(in[7], in[8]); 
163891037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_unpacklo_epi16(in[5], in[10]); 163991037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_unpackhi_epi16(in[5], in[10]); 164091037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_unpacklo_epi16(in[3], in[12]); 164191037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_unpackhi_epi16(in[3], in[12]); 164291037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_unpacklo_epi16(in[1], in[14]); 164391037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_unpackhi_epi16(in[1], in[14]); 164491037db265ecdd914a26e056cf69207b4f50924ehkuang 164591037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); 164691037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); 164791037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); 164891037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); 164991037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); 165091037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); 165191037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); 165291037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); 165391037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); 165491037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); 165591037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); 165691037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); 165791037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); 165891037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); 
165991037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); 166091037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); 166191037db265ecdd914a26e056cf69207b4f50924ehkuang v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); 166291037db265ecdd914a26e056cf69207b4f50924ehkuang v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); 166391037db265ecdd914a26e056cf69207b4f50924ehkuang v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); 166491037db265ecdd914a26e056cf69207b4f50924ehkuang v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); 166591037db265ecdd914a26e056cf69207b4f50924ehkuang v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); 166691037db265ecdd914a26e056cf69207b4f50924ehkuang v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); 166791037db265ecdd914a26e056cf69207b4f50924ehkuang v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); 166891037db265ecdd914a26e056cf69207b4f50924ehkuang v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); 166991037db265ecdd914a26e056cf69207b4f50924ehkuang v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); 167091037db265ecdd914a26e056cf69207b4f50924ehkuang v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); 167191037db265ecdd914a26e056cf69207b4f50924ehkuang v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); 167291037db265ecdd914a26e056cf69207b4f50924ehkuang v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); 167391037db265ecdd914a26e056cf69207b4f50924ehkuang v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); 167491037db265ecdd914a26e056cf69207b4f50924ehkuang v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); 167591037db265ecdd914a26e056cf69207b4f50924ehkuang v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); 167691037db265ecdd914a26e056cf69207b4f50924ehkuang v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); 167791037db265ecdd914a26e056cf69207b4f50924ehkuang 167891037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[16]); 167991037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = 
_mm_add_epi32(v[1], v[17]); 168091037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], v[18]); 168191037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], v[19]); 168291037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], v[20]); 168391037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], v[21]); 168491037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], v[22]); 168591037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], v[23]); 168691037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], v[24]); 168791037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], v[25]); 168891037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], v[26]); 168991037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], v[27]); 169091037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(v[12], v[28]); 169191037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(v[13], v[29]); 169291037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(v[14], v[30]); 169391037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(v[15], v[31]); 169491037db265ecdd914a26e056cf69207b4f50924ehkuang u[16] = _mm_sub_epi32(v[0], v[16]); 169591037db265ecdd914a26e056cf69207b4f50924ehkuang u[17] = _mm_sub_epi32(v[1], v[17]); 169691037db265ecdd914a26e056cf69207b4f50924ehkuang u[18] = _mm_sub_epi32(v[2], v[18]); 169791037db265ecdd914a26e056cf69207b4f50924ehkuang u[19] = _mm_sub_epi32(v[3], v[19]); 169891037db265ecdd914a26e056cf69207b4f50924ehkuang u[20] = _mm_sub_epi32(v[4], v[20]); 169991037db265ecdd914a26e056cf69207b4f50924ehkuang u[21] = _mm_sub_epi32(v[5], v[21]); 170091037db265ecdd914a26e056cf69207b4f50924ehkuang u[22] = _mm_sub_epi32(v[6], v[22]); 170191037db265ecdd914a26e056cf69207b4f50924ehkuang u[23] = _mm_sub_epi32(v[7], v[23]); 170291037db265ecdd914a26e056cf69207b4f50924ehkuang u[24] = 
_mm_sub_epi32(v[8], v[24]); 170391037db265ecdd914a26e056cf69207b4f50924ehkuang u[25] = _mm_sub_epi32(v[9], v[25]); 170491037db265ecdd914a26e056cf69207b4f50924ehkuang u[26] = _mm_sub_epi32(v[10], v[26]); 170591037db265ecdd914a26e056cf69207b4f50924ehkuang u[27] = _mm_sub_epi32(v[11], v[27]); 170691037db265ecdd914a26e056cf69207b4f50924ehkuang u[28] = _mm_sub_epi32(v[12], v[28]); 170791037db265ecdd914a26e056cf69207b4f50924ehkuang u[29] = _mm_sub_epi32(v[13], v[29]); 170891037db265ecdd914a26e056cf69207b4f50924ehkuang u[30] = _mm_sub_epi32(v[14], v[30]); 170991037db265ecdd914a26e056cf69207b4f50924ehkuang u[31] = _mm_sub_epi32(v[15], v[31]); 171091037db265ecdd914a26e056cf69207b4f50924ehkuang 171191037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 171291037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 171391037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 171491037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 171591037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 171691037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 171791037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 171891037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 171991037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 172091037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 172191037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 172291037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 172391037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = 
_mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 172491037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 172591037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 172691037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 172791037db265ecdd914a26e056cf69207b4f50924ehkuang v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 172891037db265ecdd914a26e056cf69207b4f50924ehkuang v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 172991037db265ecdd914a26e056cf69207b4f50924ehkuang v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); 173091037db265ecdd914a26e056cf69207b4f50924ehkuang v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 173191037db265ecdd914a26e056cf69207b4f50924ehkuang v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 173291037db265ecdd914a26e056cf69207b4f50924ehkuang v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 173391037db265ecdd914a26e056cf69207b4f50924ehkuang v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 173491037db265ecdd914a26e056cf69207b4f50924ehkuang v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 173591037db265ecdd914a26e056cf69207b4f50924ehkuang v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 173691037db265ecdd914a26e056cf69207b4f50924ehkuang v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 173791037db265ecdd914a26e056cf69207b4f50924ehkuang v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 173891037db265ecdd914a26e056cf69207b4f50924ehkuang v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 173991037db265ecdd914a26e056cf69207b4f50924ehkuang v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 174091037db265ecdd914a26e056cf69207b4f50924ehkuang v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 174191037db265ecdd914a26e056cf69207b4f50924ehkuang v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 174291037db265ecdd914a26e056cf69207b4f50924ehkuang v[31] = _mm_add_epi32(u[31], 
k__DCT_CONST_ROUNDING); 174391037db265ecdd914a26e056cf69207b4f50924ehkuang 174491037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 174591037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 174691037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 174791037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 174891037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 174991037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 175091037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 175191037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 175291037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 175391037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 175491037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 175591037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 175691037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 175791037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 175891037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 175991037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 176091037db265ecdd914a26e056cf69207b4f50924ehkuang u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); 176191037db265ecdd914a26e056cf69207b4f50924ehkuang u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); 176291037db265ecdd914a26e056cf69207b4f50924ehkuang u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); 176391037db265ecdd914a26e056cf69207b4f50924ehkuang u[19] = _mm_srai_epi32(v[19], 
DCT_CONST_BITS); 176491037db265ecdd914a26e056cf69207b4f50924ehkuang u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); 176591037db265ecdd914a26e056cf69207b4f50924ehkuang u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); 176691037db265ecdd914a26e056cf69207b4f50924ehkuang u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); 176791037db265ecdd914a26e056cf69207b4f50924ehkuang u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); 176891037db265ecdd914a26e056cf69207b4f50924ehkuang u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); 176991037db265ecdd914a26e056cf69207b4f50924ehkuang u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); 177091037db265ecdd914a26e056cf69207b4f50924ehkuang u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); 177191037db265ecdd914a26e056cf69207b4f50924ehkuang u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); 177291037db265ecdd914a26e056cf69207b4f50924ehkuang u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); 177391037db265ecdd914a26e056cf69207b4f50924ehkuang u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); 177491037db265ecdd914a26e056cf69207b4f50924ehkuang u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); 177591037db265ecdd914a26e056cf69207b4f50924ehkuang u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); 177691037db265ecdd914a26e056cf69207b4f50924ehkuang 177791037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = _mm_packs_epi32(u[0], u[1]); 177891037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_packs_epi32(u[2], u[3]); 177991037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_packs_epi32(u[4], u[5]); 178091037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_packs_epi32(u[6], u[7]); 178191037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_packs_epi32(u[8], u[9]); 178291037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_packs_epi32(u[10], u[11]); 178391037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_packs_epi32(u[12], u[13]); 178491037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_packs_epi32(u[14], u[15]); 178591037db265ecdd914a26e056cf69207b4f50924ehkuang 
s[8] = _mm_packs_epi32(u[16], u[17]); 178691037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = _mm_packs_epi32(u[18], u[19]); 178791037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_packs_epi32(u[20], u[21]); 178891037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = _mm_packs_epi32(u[22], u[23]); 178991037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = _mm_packs_epi32(u[24], u[25]); 179091037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(u[26], u[27]); 179191037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = _mm_packs_epi32(u[28], u[29]); 179291037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = _mm_packs_epi32(u[30], u[31]); 179391037db265ecdd914a26e056cf69207b4f50924ehkuang 179491037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 179591037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[8], s[9]); 179691037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[8], s[9]); 179791037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[10], s[11]); 179891037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[10], s[11]); 179991037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(s[12], s[13]); 180091037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(s[12], s[13]); 180191037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(s[14], s[15]); 180291037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(s[14], s[15]); 180391037db265ecdd914a26e056cf69207b4f50924ehkuang 180491037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 180591037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 180691037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 180791037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 
180891037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 180991037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 181091037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 181191037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 181291037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); 181391037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); 181491037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); 181591037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); 181691037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); 181791037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); 181891037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); 181991037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); 182091037db265ecdd914a26e056cf69207b4f50924ehkuang 182191037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[8]); 182291037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], v[9]); 182391037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], v[10]); 182491037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], v[11]); 182591037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], v[12]); 182691037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], v[13]); 182791037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], v[14]); 182891037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], v[15]); 182991037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_sub_epi32(v[0], 
v[8]); 183091037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_sub_epi32(v[1], v[9]); 183191037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_sub_epi32(v[2], v[10]); 183291037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_sub_epi32(v[3], v[11]); 183391037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_sub_epi32(v[4], v[12]); 183491037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_sub_epi32(v[5], v[13]); 183591037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_sub_epi32(v[6], v[14]); 183691037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_sub_epi32(v[7], v[15]); 183791037db265ecdd914a26e056cf69207b4f50924ehkuang 183891037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 183991037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 184091037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 184191037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 184291037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 184391037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 184491037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 184591037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 184691037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 184791037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 184891037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 184991037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 185091037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 
185191037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 185291037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 185391037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 185491037db265ecdd914a26e056cf69207b4f50924ehkuang 185591037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 185691037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 185791037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 185891037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 185991037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 186091037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 186191037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 186291037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 186391037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 186491037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 186591037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 186691037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 186791037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 186891037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 186991037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 187091037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 187191037db265ecdd914a26e056cf69207b4f50924ehkuang 
187291037db265ecdd914a26e056cf69207b4f50924ehkuang x[0] = _mm_add_epi16(s[0], s[4]); 187391037db265ecdd914a26e056cf69207b4f50924ehkuang x[1] = _mm_add_epi16(s[1], s[5]); 187491037db265ecdd914a26e056cf69207b4f50924ehkuang x[2] = _mm_add_epi16(s[2], s[6]); 187591037db265ecdd914a26e056cf69207b4f50924ehkuang x[3] = _mm_add_epi16(s[3], s[7]); 187691037db265ecdd914a26e056cf69207b4f50924ehkuang x[4] = _mm_sub_epi16(s[0], s[4]); 187791037db265ecdd914a26e056cf69207b4f50924ehkuang x[5] = _mm_sub_epi16(s[1], s[5]); 187891037db265ecdd914a26e056cf69207b4f50924ehkuang x[6] = _mm_sub_epi16(s[2], s[6]); 187991037db265ecdd914a26e056cf69207b4f50924ehkuang x[7] = _mm_sub_epi16(s[3], s[7]); 188091037db265ecdd914a26e056cf69207b4f50924ehkuang x[8] = _mm_packs_epi32(u[0], u[1]); 188191037db265ecdd914a26e056cf69207b4f50924ehkuang x[9] = _mm_packs_epi32(u[2], u[3]); 188291037db265ecdd914a26e056cf69207b4f50924ehkuang x[10] = _mm_packs_epi32(u[4], u[5]); 188391037db265ecdd914a26e056cf69207b4f50924ehkuang x[11] = _mm_packs_epi32(u[6], u[7]); 188491037db265ecdd914a26e056cf69207b4f50924ehkuang x[12] = _mm_packs_epi32(u[8], u[9]); 188591037db265ecdd914a26e056cf69207b4f50924ehkuang x[13] = _mm_packs_epi32(u[10], u[11]); 188691037db265ecdd914a26e056cf69207b4f50924ehkuang x[14] = _mm_packs_epi32(u[12], u[13]); 188791037db265ecdd914a26e056cf69207b4f50924ehkuang x[15] = _mm_packs_epi32(u[14], u[15]); 188891037db265ecdd914a26e056cf69207b4f50924ehkuang 188991037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 3 189091037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(x[4], x[5]); 189191037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(x[4], x[5]); 189291037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(x[6], x[7]); 189391037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(x[6], x[7]); 189491037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(x[12], x[13]); 
189591037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(x[12], x[13]); 189691037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(x[14], x[15]); 189791037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(x[14], x[15]); 189891037db265ecdd914a26e056cf69207b4f50924ehkuang 189991037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); 190091037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 190191037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); 190291037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 190391037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); 190491037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); 190591037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 190691037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 190791037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); 190891037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); 190991037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); 191091037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); 191191037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); 191291037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); 191391037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); 191491037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); 191591037db265ecdd914a26e056cf69207b4f50924ehkuang 
191691037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[4]); 191791037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], v[5]); 191891037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], v[6]); 191991037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], v[7]); 192091037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_sub_epi32(v[0], v[4]); 192191037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_sub_epi32(v[1], v[5]); 192291037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_sub_epi32(v[2], v[6]); 192391037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_sub_epi32(v[3], v[7]); 192491037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], v[12]); 192591037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], v[13]); 192691037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], v[14]); 192791037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], v[15]); 192891037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_sub_epi32(v[8], v[12]); 192991037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_sub_epi32(v[9], v[13]); 193091037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_sub_epi32(v[10], v[14]); 193191037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_sub_epi32(v[11], v[15]); 193291037db265ecdd914a26e056cf69207b4f50924ehkuang 193391037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 193491037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 193591037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 193691037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 193791037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 193891037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = 
_mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 193991037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 194091037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 194191037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 194291037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 194391037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 194491037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 194591037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 194691037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 194791037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 194891037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 194991037db265ecdd914a26e056cf69207b4f50924ehkuang 195091037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 195191037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 195291037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 195391037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 195491037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 195591037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 195691037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 195791037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 195891037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 
195991037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 196091037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 196191037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 196291037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 196391037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 196491037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 196591037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 196691037db265ecdd914a26e056cf69207b4f50924ehkuang 196791037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = _mm_add_epi16(x[0], x[2]); 196891037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_add_epi16(x[1], x[3]); 196991037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_sub_epi16(x[0], x[2]); 197091037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_sub_epi16(x[1], x[3]); 197191037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_packs_epi32(v[0], v[1]); 197291037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_packs_epi32(v[2], v[3]); 197391037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_packs_epi32(v[4], v[5]); 197491037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_packs_epi32(v[6], v[7]); 197591037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = _mm_add_epi16(x[8], x[10]); 197691037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = _mm_add_epi16(x[9], x[11]); 197791037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_sub_epi16(x[8], x[10]); 197891037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = _mm_sub_epi16(x[9], x[11]); 197991037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = _mm_packs_epi32(v[8], v[9]); 198091037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(v[10], v[11]); 198191037db265ecdd914a26e056cf69207b4f50924ehkuang 
s[14] = _mm_packs_epi32(v[12], v[13]); 198291037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = _mm_packs_epi32(v[14], v[15]); 198391037db265ecdd914a26e056cf69207b4f50924ehkuang 198491037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 4 198591037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[2], s[3]); 198691037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[2], s[3]); 198791037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[6], s[7]); 198891037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[6], s[7]); 198991037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(s[10], s[11]); 199091037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(s[10], s[11]); 199191037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(s[14], s[15]); 199291037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(s[14], s[15]); 199391037db265ecdd914a26e056cf69207b4f50924ehkuang 199491037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); 199591037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); 199691037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 199791037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 199891037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 199991037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 200091037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 200191037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 200291037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); 200391037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], 
k__cospi_p16_p16); 200491037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); 200591037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); 200691037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); 200791037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); 200891037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); 200991037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); 201091037db265ecdd914a26e056cf69207b4f50924ehkuang 201191037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 201291037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 201391037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 201491037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 201591037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 201691037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 201791037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 201891037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 201991037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 202091037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 202191037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 202291037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 202391037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 
202491037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 202591037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 202691037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 202791037db265ecdd914a26e056cf69207b4f50924ehkuang 202891037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 202991037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 203091037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 203191037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 203291037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 203391037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 203491037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 203591037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 203691037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 203791037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 203891037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 203991037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 204091037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 204191037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 204291037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 204391037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 204491037db265ecdd914a26e056cf69207b4f50924ehkuang 
204591037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = s[0]; 204691037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_sub_epi16(kZero, s[8]); 204791037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = s[12]; 204891037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_sub_epi16(kZero, s[4]); 204991037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_packs_epi32(v[4], v[5]); 205091037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_packs_epi32(v[12], v[13]); 205191037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_packs_epi32(v[8], v[9]); 205291037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_packs_epi32(v[0], v[1]); 205391037db265ecdd914a26e056cf69207b4f50924ehkuang in[8] = _mm_packs_epi32(v[2], v[3]); 205491037db265ecdd914a26e056cf69207b4f50924ehkuang in[9] = _mm_packs_epi32(v[10], v[11]); 205591037db265ecdd914a26e056cf69207b4f50924ehkuang in[10] = _mm_packs_epi32(v[14], v[15]); 205691037db265ecdd914a26e056cf69207b4f50924ehkuang in[11] = _mm_packs_epi32(v[6], v[7]); 205791037db265ecdd914a26e056cf69207b4f50924ehkuang in[12] = s[5]; 205891037db265ecdd914a26e056cf69207b4f50924ehkuang in[13] = _mm_sub_epi16(kZero, s[13]); 205991037db265ecdd914a26e056cf69207b4f50924ehkuang in[14] = s[9]; 206091037db265ecdd914a26e056cf69207b4f50924ehkuang in[15] = _mm_sub_epi16(kZero, s[1]); 206191037db265ecdd914a26e056cf69207b4f50924ehkuang} 206291037db265ecdd914a26e056cf69207b4f50924ehkuang 2063b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct16_8col(__m128i *in) { 206491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 206591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 206691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 206791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p18_p14 = 
pair_set_epi16(cospi_18_64, cospi_14_64); 206891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 206991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 207091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 207191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 207291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 207391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 207491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 207591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 207691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 207791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 207891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 207991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 208091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 208191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 208291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 208391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i 
k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 208491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 208591037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i v[16], u[16], s[16], t[16]; 208691037db265ecdd914a26e056cf69207b4f50924ehkuang 208791037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 1 208891037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = in[0]; 208991037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = in[8]; 209091037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = in[4]; 209191037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = in[12]; 209291037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = in[2]; 209391037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = in[10]; 209491037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = in[6]; 209591037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = in[14]; 209691037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = in[1]; 209791037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = in[9]; 209891037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = in[5]; 209991037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = in[13]; 210091037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = in[3]; 210191037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = in[11]; 210291037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = in[7]; 210391037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = in[15]; 210491037db265ecdd914a26e056cf69207b4f50924ehkuang 210591037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 210691037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[8], s[15]); 210791037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[8], s[15]); 210891037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[9], s[14]); 210991037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[9], s[14]); 211091037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(s[10], 
s[13]); 211191037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(s[10], s[13]); 211291037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(s[11], s[12]); 211391037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(s[11], s[12]); 211491037db265ecdd914a26e056cf69207b4f50924ehkuang 211591037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); 211691037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); 211791037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); 211891037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); 211991037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); 212091037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); 212191037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); 212291037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); 212391037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); 212491037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); 212591037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); 212691037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); 212791037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); 212891037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); 212991037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); 213091037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); 213191037db265ecdd914a26e056cf69207b4f50924ehkuang 
213291037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 213391037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 213491037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 213591037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 213691037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 213791037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 213891037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 213991037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 214091037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 214191037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 214291037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 214391037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 214491037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 214591037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 214691037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 214791037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 214891037db265ecdd914a26e056cf69207b4f50924ehkuang 214991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 215091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 215191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 
215291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 215391037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 215491037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 215591037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 215691037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 215791037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 215891037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 215991037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 216091037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 216191037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 216291037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 216391037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 216491037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 216591037db265ecdd914a26e056cf69207b4f50924ehkuang 216691037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = _mm_packs_epi32(u[0], u[1]); 216791037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = _mm_packs_epi32(u[2], u[3]); 216891037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = _mm_packs_epi32(u[4], u[5]); 216991037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = _mm_packs_epi32(u[6], u[7]); 217091037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_packs_epi32(u[8], u[9]); 217191037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(u[10], u[11]); 217291037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = _mm_packs_epi32(u[12], u[13]); 217391037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = 
_mm_packs_epi32(u[14], u[15]); 217491037db265ecdd914a26e056cf69207b4f50924ehkuang 217591037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 3 217691037db265ecdd914a26e056cf69207b4f50924ehkuang t[0] = s[0]; 217791037db265ecdd914a26e056cf69207b4f50924ehkuang t[1] = s[1]; 217891037db265ecdd914a26e056cf69207b4f50924ehkuang t[2] = s[2]; 217991037db265ecdd914a26e056cf69207b4f50924ehkuang t[3] = s[3]; 218091037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[4], s[7]); 218191037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[4], s[7]); 218291037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[5], s[6]); 218391037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[5], s[6]); 218491037db265ecdd914a26e056cf69207b4f50924ehkuang 218591037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 218691037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 218791037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 218891037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 218991037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 219091037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 219191037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 219291037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 219391037db265ecdd914a26e056cf69207b4f50924ehkuang 219491037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 219591037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 219691037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 
219791037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 219891037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 219991037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 220091037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 220191037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 220291037db265ecdd914a26e056cf69207b4f50924ehkuang 220391037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 220491037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 220591037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 220691037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 220791037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 220891037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 220991037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 221091037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 221191037db265ecdd914a26e056cf69207b4f50924ehkuang 221291037db265ecdd914a26e056cf69207b4f50924ehkuang t[4] = _mm_packs_epi32(u[0], u[1]); 221391037db265ecdd914a26e056cf69207b4f50924ehkuang t[7] = _mm_packs_epi32(u[2], u[3]); 221491037db265ecdd914a26e056cf69207b4f50924ehkuang t[5] = _mm_packs_epi32(u[4], u[5]); 221591037db265ecdd914a26e056cf69207b4f50924ehkuang t[6] = _mm_packs_epi32(u[6], u[7]); 221691037db265ecdd914a26e056cf69207b4f50924ehkuang t[8] = _mm_add_epi16(s[8], s[9]); 221791037db265ecdd914a26e056cf69207b4f50924ehkuang t[9] = _mm_sub_epi16(s[8], s[9]); 221891037db265ecdd914a26e056cf69207b4f50924ehkuang t[10] = _mm_sub_epi16(s[11], s[10]); 
221991037db265ecdd914a26e056cf69207b4f50924ehkuang t[11] = _mm_add_epi16(s[10], s[11]); 222091037db265ecdd914a26e056cf69207b4f50924ehkuang t[12] = _mm_add_epi16(s[12], s[13]); 222191037db265ecdd914a26e056cf69207b4f50924ehkuang t[13] = _mm_sub_epi16(s[12], s[13]); 222291037db265ecdd914a26e056cf69207b4f50924ehkuang t[14] = _mm_sub_epi16(s[15], s[14]); 222391037db265ecdd914a26e056cf69207b4f50924ehkuang t[15] = _mm_add_epi16(s[14], s[15]); 222491037db265ecdd914a26e056cf69207b4f50924ehkuang 222591037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 4 222691037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(t[0], t[1]); 222791037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(t[0], t[1]); 222891037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(t[2], t[3]); 222991037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(t[2], t[3]); 223091037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(t[9], t[14]); 223191037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(t[9], t[14]); 223291037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(t[10], t[13]); 223391037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(t[10], t[13]); 223491037db265ecdd914a26e056cf69207b4f50924ehkuang 223591037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 223691037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 223791037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 223891037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 223991037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); 224091037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); 224191037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], 
k__cospi_p08_p24); 224291037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 224391037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); 224491037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); 224591037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); 224691037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); 224791037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); 224891037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); 224991037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); 225091037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); 225191037db265ecdd914a26e056cf69207b4f50924ehkuang 225291037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 225391037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 225491037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 225591037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 225691037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 225791037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 225891037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 225991037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 226091037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 226191037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 
226291037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 226391037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 226491037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 226591037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 226691037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 226791037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 226891037db265ecdd914a26e056cf69207b4f50924ehkuang 226991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 227091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 227191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 227291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 227391037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 227491037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 227591037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 227691037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 227791037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 227891037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 227991037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 228091037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 228191037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 228291037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = 
_mm_srai_epi32(u[13], DCT_CONST_BITS); 228391037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 228491037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 228591037db265ecdd914a26e056cf69207b4f50924ehkuang 228691037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = _mm_packs_epi32(u[0], u[1]); 228791037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_packs_epi32(u[2], u[3]); 228891037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_packs_epi32(u[4], u[5]); 228991037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_packs_epi32(u[6], u[7]); 229091037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_add_epi16(t[4], t[5]); 229191037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_sub_epi16(t[4], t[5]); 229291037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_sub_epi16(t[7], t[6]); 229391037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_add_epi16(t[6], t[7]); 229491037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = t[8]; 229591037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = t[15]; 229691037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = _mm_packs_epi32(u[8], u[9]); 229791037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = _mm_packs_epi32(u[10], u[11]); 229891037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_packs_epi32(u[12], u[13]); 229991037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(u[14], u[15]); 230091037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = t[11]; 230191037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = t[12]; 230291037db265ecdd914a26e056cf69207b4f50924ehkuang 230391037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 5 230491037db265ecdd914a26e056cf69207b4f50924ehkuang t[0] = _mm_add_epi16(s[0], s[3]); 230591037db265ecdd914a26e056cf69207b4f50924ehkuang t[1] = _mm_add_epi16(s[1], s[2]); 230691037db265ecdd914a26e056cf69207b4f50924ehkuang t[2] = _mm_sub_epi16(s[1], s[2]); 
230791037db265ecdd914a26e056cf69207b4f50924ehkuang t[3] = _mm_sub_epi16(s[0], s[3]); 230891037db265ecdd914a26e056cf69207b4f50924ehkuang t[4] = s[4]; 230991037db265ecdd914a26e056cf69207b4f50924ehkuang t[7] = s[7]; 231091037db265ecdd914a26e056cf69207b4f50924ehkuang 231191037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[5], s[6]); 231291037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[5], s[6]); 231391037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 231491037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 231591037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 231691037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 231791037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 231891037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 231991037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 232091037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 232191037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 232291037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 232391037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 232491037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 232591037db265ecdd914a26e056cf69207b4f50924ehkuang t[5] = _mm_packs_epi32(u[0], u[1]); 232691037db265ecdd914a26e056cf69207b4f50924ehkuang t[6] = _mm_packs_epi32(u[2], u[3]); 232791037db265ecdd914a26e056cf69207b4f50924ehkuang 232891037db265ecdd914a26e056cf69207b4f50924ehkuang t[8] = _mm_add_epi16(s[8], s[11]); 232991037db265ecdd914a26e056cf69207b4f50924ehkuang t[9] = 
_mm_add_epi16(s[9], s[10]); 233091037db265ecdd914a26e056cf69207b4f50924ehkuang t[10] = _mm_sub_epi16(s[9], s[10]); 233191037db265ecdd914a26e056cf69207b4f50924ehkuang t[11] = _mm_sub_epi16(s[8], s[11]); 233291037db265ecdd914a26e056cf69207b4f50924ehkuang t[12] = _mm_sub_epi16(s[15], s[12]); 233391037db265ecdd914a26e056cf69207b4f50924ehkuang t[13] = _mm_sub_epi16(s[14], s[13]); 233491037db265ecdd914a26e056cf69207b4f50924ehkuang t[14] = _mm_add_epi16(s[13], s[14]); 233591037db265ecdd914a26e056cf69207b4f50924ehkuang t[15] = _mm_add_epi16(s[12], s[15]); 233691037db265ecdd914a26e056cf69207b4f50924ehkuang 233791037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 6 233891037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = _mm_add_epi16(t[0], t[7]); 233991037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_add_epi16(t[1], t[6]); 234091037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_add_epi16(t[2], t[5]); 234191037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_add_epi16(t[3], t[4]); 234291037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_sub_epi16(t[3], t[4]); 234391037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_sub_epi16(t[2], t[5]); 234491037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_sub_epi16(t[1], t[6]); 234591037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_sub_epi16(t[0], t[7]); 234691037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = t[8]; 234791037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = t[9]; 234891037db265ecdd914a26e056cf69207b4f50924ehkuang 234991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(t[10], t[13]); 235091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(t[10], t[13]); 235191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(t[11], t[12]); 235291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(t[11], t[12]); 235391037db265ecdd914a26e056cf69207b4f50924ehkuang 235491037db265ecdd914a26e056cf69207b4f50924ehkuang 
v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 235591037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 235691037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 235791037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 235891037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 235991037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 236091037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 236191037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 236291037db265ecdd914a26e056cf69207b4f50924ehkuang 236391037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 236491037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 236591037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 236691037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 236791037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 236891037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 236991037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 237091037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 237191037db265ecdd914a26e056cf69207b4f50924ehkuang 237291037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 237391037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 237491037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 237591037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = 
_mm_srai_epi32(u[3], DCT_CONST_BITS); 237691037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 237791037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 237891037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 237991037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 238091037db265ecdd914a26e056cf69207b4f50924ehkuang 238191037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_packs_epi32(u[0], u[1]); 238291037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(u[2], u[3]); 238391037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = _mm_packs_epi32(u[4], u[5]); 238491037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = _mm_packs_epi32(u[6], u[7]); 238591037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = t[14]; 238691037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = t[15]; 238791037db265ecdd914a26e056cf69207b4f50924ehkuang 238891037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 7 238991037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_add_epi16(s[0], s[15]); 239091037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_add_epi16(s[1], s[14]); 239191037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_add_epi16(s[2], s[13]); 239291037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_add_epi16(s[3], s[12]); 239391037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_add_epi16(s[4], s[11]); 239491037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_add_epi16(s[5], s[10]); 239591037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_add_epi16(s[6], s[9]); 239691037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_add_epi16(s[7], s[8]); 239791037db265ecdd914a26e056cf69207b4f50924ehkuang in[8] = _mm_sub_epi16(s[7], s[8]); 239891037db265ecdd914a26e056cf69207b4f50924ehkuang in[9] = _mm_sub_epi16(s[6], s[9]); 239991037db265ecdd914a26e056cf69207b4f50924ehkuang in[10] = 
_mm_sub_epi16(s[5], s[10]); 240091037db265ecdd914a26e056cf69207b4f50924ehkuang in[11] = _mm_sub_epi16(s[4], s[11]); 240191037db265ecdd914a26e056cf69207b4f50924ehkuang in[12] = _mm_sub_epi16(s[3], s[12]); 240291037db265ecdd914a26e056cf69207b4f50924ehkuang in[13] = _mm_sub_epi16(s[2], s[13]); 240391037db265ecdd914a26e056cf69207b4f50924ehkuang in[14] = _mm_sub_epi16(s[1], s[14]); 240491037db265ecdd914a26e056cf69207b4f50924ehkuang in[15] = _mm_sub_epi16(s[0], s[15]); 240591037db265ecdd914a26e056cf69207b4f50924ehkuang} 240691037db265ecdd914a26e056cf69207b4f50924ehkuang 2407b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct16_sse2(__m128i *in0, __m128i *in1) { 240891037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_16x16(in0, in1); 2409b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_8col(in0); 2410b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_8col(in1); 241191037db265ecdd914a26e056cf69207b4f50924ehkuang} 241291037db265ecdd914a26e056cf69207b4f50924ehkuang 2413b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst16_sse2(__m128i *in0, __m128i *in1) { 241491037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_16x16(in0, in1); 2415b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst16_8col(in0); 2416b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst16_8col(in1); 241791037db265ecdd914a26e056cf69207b4f50924ehkuang} 241891037db265ecdd914a26e056cf69207b4f50924ehkuang 24195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { 24205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); 24215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); 24225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[2] = _mm_load_si128((const __m128i *)(input + 2 * 
16)); 24235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); 24245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); 24255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); 24265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); 24275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); 24285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 24295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); 24305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); 24315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); 24325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); 24335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); 24345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); 24355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); 24365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); 243791037db265ecdd914a26e056cf69207b4f50924ehkuang} 243891037db265ecdd914a26e056cf69207b4f50924ehkuang 243991037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { 244091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i final_rounding = _mm_set1_epi16(1<<5); 244191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i zero = _mm_setzero_si128(); 244291037db265ecdd914a26e056cf69207b4f50924ehkuang // Final rounding 
and shift 244391037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_adds_epi16(in[0], final_rounding); 244491037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_adds_epi16(in[1], final_rounding); 244591037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_adds_epi16(in[2], final_rounding); 244691037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_adds_epi16(in[3], final_rounding); 244791037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_adds_epi16(in[4], final_rounding); 244891037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_adds_epi16(in[5], final_rounding); 244991037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_adds_epi16(in[6], final_rounding); 245091037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_adds_epi16(in[7], final_rounding); 245191037db265ecdd914a26e056cf69207b4f50924ehkuang in[8] = _mm_adds_epi16(in[8], final_rounding); 245291037db265ecdd914a26e056cf69207b4f50924ehkuang in[9] = _mm_adds_epi16(in[9], final_rounding); 245391037db265ecdd914a26e056cf69207b4f50924ehkuang in[10] = _mm_adds_epi16(in[10], final_rounding); 245491037db265ecdd914a26e056cf69207b4f50924ehkuang in[11] = _mm_adds_epi16(in[11], final_rounding); 245591037db265ecdd914a26e056cf69207b4f50924ehkuang in[12] = _mm_adds_epi16(in[12], final_rounding); 245691037db265ecdd914a26e056cf69207b4f50924ehkuang in[13] = _mm_adds_epi16(in[13], final_rounding); 245791037db265ecdd914a26e056cf69207b4f50924ehkuang in[14] = _mm_adds_epi16(in[14], final_rounding); 245891037db265ecdd914a26e056cf69207b4f50924ehkuang in[15] = _mm_adds_epi16(in[15], final_rounding); 245991037db265ecdd914a26e056cf69207b4f50924ehkuang 246091037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_srai_epi16(in[0], 6); 246191037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_srai_epi16(in[1], 6); 246291037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_srai_epi16(in[2], 6); 246391037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_srai_epi16(in[3], 6); 
246491037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_srai_epi16(in[4], 6); 246591037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_srai_epi16(in[5], 6); 246691037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_srai_epi16(in[6], 6); 246791037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_srai_epi16(in[7], 6); 246891037db265ecdd914a26e056cf69207b4f50924ehkuang in[8] = _mm_srai_epi16(in[8], 6); 246991037db265ecdd914a26e056cf69207b4f50924ehkuang in[9] = _mm_srai_epi16(in[9], 6); 247091037db265ecdd914a26e056cf69207b4f50924ehkuang in[10] = _mm_srai_epi16(in[10], 6); 247191037db265ecdd914a26e056cf69207b4f50924ehkuang in[11] = _mm_srai_epi16(in[11], 6); 247291037db265ecdd914a26e056cf69207b4f50924ehkuang in[12] = _mm_srai_epi16(in[12], 6); 247391037db265ecdd914a26e056cf69207b4f50924ehkuang in[13] = _mm_srai_epi16(in[13], 6); 247491037db265ecdd914a26e056cf69207b4f50924ehkuang in[14] = _mm_srai_epi16(in[14], 6); 247591037db265ecdd914a26e056cf69207b4f50924ehkuang in[15] = _mm_srai_epi16(in[15], 6); 247691037db265ecdd914a26e056cf69207b4f50924ehkuang 247791037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[0]); 247891037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[1]); 247991037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[2]); 248091037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[3]); 248191037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[4]); 248291037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[5]); 248391037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[6]); 248491037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[7]); 248591037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[8]); 248691037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[9]); 248791037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[10]); 
248891037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[11]); 248991037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[12]); 249091037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[13]); 249191037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[14]); 249291037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[15]); 249391037db265ecdd914a26e056cf69207b4f50924ehkuang} 249491037db265ecdd914a26e056cf69207b4f50924ehkuang 24955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, 24965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int tx_type) { 249791037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in0[16], in1[16]; 249891037db265ecdd914a26e056cf69207b4f50924ehkuang 249991037db265ecdd914a26e056cf69207b4f50924ehkuang load_buffer_8x16(input, in0); 250091037db265ecdd914a26e056cf69207b4f50924ehkuang input += 8; 250191037db265ecdd914a26e056cf69207b4f50924ehkuang load_buffer_8x16(input, in1); 250291037db265ecdd914a26e056cf69207b4f50924ehkuang 250391037db265ecdd914a26e056cf69207b4f50924ehkuang switch (tx_type) { 250491037db265ecdd914a26e056cf69207b4f50924ehkuang case 0: // DCT_DCT 2505b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_sse2(in0, in1); 2506b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_sse2(in0, in1); 250791037db265ecdd914a26e056cf69207b4f50924ehkuang break; 250891037db265ecdd914a26e056cf69207b4f50924ehkuang case 1: // ADST_DCT 2509b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_sse2(in0, in1); 2510b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst16_sse2(in0, in1); 251191037db265ecdd914a26e056cf69207b4f50924ehkuang break; 251291037db265ecdd914a26e056cf69207b4f50924ehkuang case 2: // DCT_ADST 2513b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst16_sse2(in0, in1); 
2514b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_sse2(in0, in1); 251591037db265ecdd914a26e056cf69207b4f50924ehkuang break; 251691037db265ecdd914a26e056cf69207b4f50924ehkuang case 3: // ADST_ADST 2517b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst16_sse2(in0, in1); 2518b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst16_sse2(in0, in1); 251991037db265ecdd914a26e056cf69207b4f50924ehkuang break; 252091037db265ecdd914a26e056cf69207b4f50924ehkuang default: 252191037db265ecdd914a26e056cf69207b4f50924ehkuang assert(0); 252291037db265ecdd914a26e056cf69207b4f50924ehkuang break; 252391037db265ecdd914a26e056cf69207b4f50924ehkuang } 252491037db265ecdd914a26e056cf69207b4f50924ehkuang 252591037db265ecdd914a26e056cf69207b4f50924ehkuang write_buffer_8x16(dest, in0, stride); 252691037db265ecdd914a26e056cf69207b4f50924ehkuang dest += 8; 252791037db265ecdd914a26e056cf69207b4f50924ehkuang write_buffer_8x16(dest, in1, stride); 252891037db265ecdd914a26e056cf69207b4f50924ehkuang} 252991037db265ecdd914a26e056cf69207b4f50924ehkuang 25305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, 25315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int stride) { 2532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 2533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i final_rounding = _mm_set1_epi16(1<<5); 2534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 2535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 2537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 2538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 
2539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 2540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 2542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 2543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 2545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 2546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 2548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 2549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 2552b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in[16], l[16]; 2553b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, 2554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 2555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8_0, stp1_12_0; 2556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 2557b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; 
2558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 2559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int i; 2560b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // First 1-D inverse DCT 2561b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Load input data. 2562b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_load_si128((const __m128i *)input); 2563b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); 2564b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); 2565b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); 2566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2567b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); 2568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage2 2570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2571b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); 2572b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); 2573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); 2575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); 2576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); 2577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); 2578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = 
_mm_add_epi32(tmp0, rounding); 2580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 2581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_add_epi32(tmp5, rounding); 2582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_add_epi32(tmp7, rounding); 2583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 2587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 2588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2589b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_8 = _mm_packs_epi32(tmp0, tmp2); 2590b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_11 = _mm_packs_epi32(tmp5, tmp7); 2591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage3 2594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2595b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); 2596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); 2598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); 2599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 2601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 2602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, 
DCT_CONST_BITS); 2604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2605b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); 2606b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); 2607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2608b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_4 = _mm_packs_epi32(tmp0, tmp2); 2609ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2610ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2611ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage4 2612ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2613b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); 2614b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); 2615b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); 2616ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2617ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); 2618ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); 2619ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); 2620ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); 2621ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); 2622ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); 2623ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2624ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 2625ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 2626ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_add_epi32(tmp1, rounding); 
2627ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_add_epi32(tmp3, rounding); 2628ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_add_epi32(tmp5, rounding); 2629ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_add_epi32(tmp7, rounding); 2630ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2631ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2632ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2633ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 2634ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 2635ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 2636ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 2637ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2638b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_0 = _mm_packs_epi32(tmp0, tmp0); 2639b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_1 = _mm_packs_epi32(tmp2, tmp2); 2640b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_9 = _mm_packs_epi32(tmp1, tmp3); 2641b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_10 = _mm_packs_epi32(tmp5, tmp7); 2642b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2643b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); 2644ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2645ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2646ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage5 and Stage6 2647ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2648b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_add_epi16(stp2_8, stp2_11); 2649b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = 
_mm_sub_epi16(stp2_8, stp2_11); 2650b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_add_epi16(stp2_9, stp2_10); 2651b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_sub_epi16(stp2_9, stp2_10); 2652b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2653b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_9 = _mm_unpacklo_epi64(tmp2, zero); 2654b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_10 = _mm_unpacklo_epi64(tmp3, zero); 2655b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_8 = _mm_unpacklo_epi64(tmp0, zero); 2656b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_11 = _mm_unpacklo_epi64(tmp1, zero); 2657b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2658b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_13 = _mm_unpackhi_epi64(tmp3, zero); 2659b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_14 = _mm_unpackhi_epi64(tmp2, zero); 2660b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_12 = _mm_unpackhi_epi64(tmp1, zero); 2661b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_15 = _mm_unpackhi_epi64(tmp0, zero); 2662ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2663ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2664ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage6 2665ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2666b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); 2667ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); 2668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); 2669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = 
_mm_madd_epi16(lo_6_5, stg4_1); 2671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); 2672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); 2673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); 2674ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); 2675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); 2676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2677ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_add_epi32(tmp1, rounding); 2678ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_add_epi32(tmp3, rounding); 2679ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 2680ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 2681ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_add_epi32(tmp4, rounding); 2682ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_add_epi32(tmp6, rounding); 2683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 2685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 2686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 2689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 2690ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2691b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_6 = _mm_packs_epi32(tmp3, tmp1); 2692b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2693ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_10 = 
_mm_packs_epi32(tmp0, zero); 2694ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_13 = _mm_packs_epi32(tmp2, zero); 2695ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_11 = _mm_packs_epi32(tmp4, zero); 2696ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_12 = _mm_packs_epi32(tmp6, zero); 2697ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2698b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_add_epi16(stp1_0, stp1_4); 2699b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_sub_epi16(stp1_0, stp1_4); 2700b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_add_epi16(stp1_1, stp1_6); 2701b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_sub_epi16(stp1_1, stp1_6); 2702b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2703b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_0 = _mm_unpackhi_epi64(tmp0, zero); 2704b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_1 = _mm_unpacklo_epi64(tmp2, zero); 2705b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_2 = _mm_unpackhi_epi64(tmp2, zero); 2706b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_3 = _mm_unpacklo_epi64(tmp0, zero); 2707b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_4 = _mm_unpacklo_epi64(tmp1, zero); 2708b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_5 = _mm_unpackhi_epi64(tmp3, zero); 2709b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_6 = _mm_unpacklo_epi64(tmp3, zero); 2710b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_7 = _mm_unpackhi_epi64(tmp1, zero); 2711ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2712ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2713ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage7. Left 8x16 only. 
2714b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[0] = _mm_add_epi16(stp2_0, stp1_15); 2715b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[1] = _mm_add_epi16(stp2_1, stp1_14); 2716b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[2] = _mm_add_epi16(stp2_2, stp2_13); 2717b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[3] = _mm_add_epi16(stp2_3, stp2_12); 2718b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[4] = _mm_add_epi16(stp2_4, stp2_11); 2719b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[5] = _mm_add_epi16(stp2_5, stp2_10); 2720b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[6] = _mm_add_epi16(stp2_6, stp1_9); 2721b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[7] = _mm_add_epi16(stp2_7, stp1_8); 2722b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[8] = _mm_sub_epi16(stp2_7, stp1_8); 2723b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[9] = _mm_sub_epi16(stp2_6, stp1_9); 2724b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[10] = _mm_sub_epi16(stp2_5, stp2_10); 2725b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[11] = _mm_sub_epi16(stp2_4, stp2_11); 2726b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[12] = _mm_sub_epi16(stp2_3, stp2_12); 2727b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[13] = _mm_sub_epi16(stp2_2, stp2_13); 2728b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[14] = _mm_sub_epi16(stp2_1, stp1_14); 2729b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian l[15] = _mm_sub_epi16(stp2_0, stp1_15); 2730b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2731b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Second 1-D inverse transform, performed per 8x16 block 
2732ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang for (i = 0; i < 2; i++) { 2733b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_4X8(l + 8*i, in); 2734ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2735b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian IDCT16_10 2736ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2737ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage7 2738b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_add_epi16(stp2_0, stp1_15); 2739b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_add_epi16(stp2_1, stp1_14); 2740b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_add_epi16(stp2_2, stp2_13); 2741b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_add_epi16(stp2_3, stp2_12); 2742b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_add_epi16(stp2_4, stp2_11); 2743b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_add_epi16(stp2_5, stp2_10); 2744b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_add_epi16(stp2_6, stp1_9); 2745b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_add_epi16(stp2_7, stp1_8); 2746b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_sub_epi16(stp2_7, stp1_8); 2747b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_sub_epi16(stp2_6, stp1_9); 2748b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_sub_epi16(stp2_5, stp2_10); 2749b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_sub_epi16(stp2_4, stp2_11); 2750b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_sub_epi16(stp2_3, stp2_12); 2751b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_sub_epi16(stp2_2, stp2_13); 
2752b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_sub_epi16(stp2_1, stp1_14); 2753b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_sub_epi16(stp2_0, stp1_15); 2754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final rounding and shift 2756b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_adds_epi16(in[0], final_rounding); 2757b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_adds_epi16(in[1], final_rounding); 2758b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_adds_epi16(in[2], final_rounding); 2759b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_adds_epi16(in[3], final_rounding); 2760b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_adds_epi16(in[4], final_rounding); 2761b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_adds_epi16(in[5], final_rounding); 2762b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_adds_epi16(in[6], final_rounding); 2763b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_adds_epi16(in[7], final_rounding); 2764b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_adds_epi16(in[8], final_rounding); 2765b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_adds_epi16(in[9], final_rounding); 2766b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_adds_epi16(in[10], final_rounding); 2767b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_adds_epi16(in[11], final_rounding); 2768b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_adds_epi16(in[12], final_rounding); 2769b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_adds_epi16(in[13], 
final_rounding); 2770b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_adds_epi16(in[14], final_rounding); 2771b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_adds_epi16(in[15], final_rounding); 2772b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2773b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_srai_epi16(in[0], 6); 2774b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_srai_epi16(in[1], 6); 2775b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_srai_epi16(in[2], 6); 2776b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_srai_epi16(in[3], 6); 2777b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_srai_epi16(in[4], 6); 2778b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_srai_epi16(in[5], 6); 2779b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_srai_epi16(in[6], 6); 2780b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_srai_epi16(in[7], 6); 2781b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_srai_epi16(in[8], 6); 2782b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_srai_epi16(in[9], 6); 2783b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_srai_epi16(in[10], 6); 2784b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_srai_epi16(in[11], 6); 2785b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_srai_epi16(in[12], 6); 2786b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_srai_epi16(in[13], 6); 2787b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_srai_epi16(in[14], 6); 2788b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = 
_mm_srai_epi16(in[15], 6); 2789b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2790b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[0]); 2791b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[1]); 2792b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[2]); 2793b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[3]); 2794b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[4]); 2795b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[5]); 2796b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[6]); 2797b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[7]); 2798b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[8]); 2799b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[9]); 2800b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[10]); 2801b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[11]); 2802b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[12]); 2803b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[13]); 2804b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[14]); 2805b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[15]); 2806ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2807ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang dest += 8 - (stride * 16); 2808ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2809ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 
/*
 * LOAD_DQCOEFF(reg, input): assigns to `reg` the 128-bit value that
 * _mm_load_si128() reads from the address held in `input`, then advances
 * `input` by 8 elements.
 *
 *  - `input` appears twice in the expansion and is modified by it, so the
 *    argument must be a modifiable pointer lvalue with no side effects.
 *  - Neither argument is parenthesized in the expansion; pass plain
 *    identifiers/lvalues rather than compound expressions.
 *  - _mm_load_si128() is the aligned 128-bit load, so the address in `input`
 *    must be 16-byte aligned.
 *  - assumes `input` points at 16-bit coefficients, so that 8 elements equal
 *    one 128-bit register -- TODO confirm at the call sites.
 *  - NOTE(review): the closing `} \` ends with a line continuation, so the
 *    definition runs onto the following empty line; that line must stay
 *    blank or the next directive is absorbed into this macro.
 */
2810ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2811f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang#define LOAD_DQCOEFF(reg, input) \ 2812f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang { \ 28135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang reg = _mm_load_si128((const __m128i *) input); \ 2814f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang input += 8; \ 2815f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang } \ 2816f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 2817b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define IDCT32_34 \ 2818b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage1 */ \ 2819b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \ 2820b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128();\ 2821b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ 2822b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ 2823b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2824b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ 2825b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ 2826b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2827b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ 2828b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ 2829b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2830b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ 
2831b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ 2832b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2833b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ 2834b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg1_1, stp1_16, stp1_31); \ 2835b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ 2836b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg1_7, stp1_19, stp1_28); \ 2837b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ 2838b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg1_9, stp1_20, stp1_27); \ 2839b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ 2840b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg1_15, stp1_23, stp1_24); \ 2841b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \ 2842b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\ 2843b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage2 */ \ 2844b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \ 2845b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128();\ 2846b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ 2847b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ 2848b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2849b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ 
2850b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ 2851b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2852b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ 2853b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg2_1, stp2_8, stp2_15); \ 2854b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ 2855b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg2_7, stp2_11, stp2_12); \ 2856b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2857b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_16 = stp1_16; \ 2858b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_19 = stp1_19; \ 2859b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2860b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_20 = stp1_20; \ 2861b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_23 = stp1_23; \ 2862b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2863b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_24 = stp1_24; \ 2864b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_27 = stp1_27; \ 2865b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2866b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_28 = stp1_28; \ 2867b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_31 = stp1_31; \ 2868b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \ 2869b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\ 2870b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage3 */ \ 2871b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \ 
2872b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128();\ 2873b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ 2874b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ 2875b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2876b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ 2877b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ 2878b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ 2879b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ 2880b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2881b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ 2882b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ 2883b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ 2884b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ 2885b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2886b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ 2887b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg3_1, stp1_4, stp1_7); \ 2888b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 
2889b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_8 = stp2_8; \ 2890b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_11 = stp2_11; \ 2891b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_12 = stp2_12; \ 2892b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_15 = stp2_15; \ 2893b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2894b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ 2895b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ 2896b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_18, stp1_29) \ 2897b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ 2898b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ 2899b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_22, stp1_25) \ 2900b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2901b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_16 = stp2_16; \ 2902b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_31 = stp2_31; \ 2903b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_19 = stp2_19; \ 2904b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_20 = stp2_20; \ 2905b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_23 = stp2_23; \ 2906b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_24 = stp2_24; \ 2907b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_27 = stp2_27; \ 2908b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_28 = stp2_28; \ 
2909b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \ 2910b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\ 2911b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage4 */ \ 2912b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \ 2913b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128();\ 2914b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ 2915b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ 2916b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2917b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ 2918b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ 2919b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ 2920b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ 2921b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2922b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ 2923b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg4_1, stp2_0, stp2_1); \ 2924b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2925b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_4 = stp1_4; \ 2926b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_5 = stp1_4; \ 2927b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_6 = stp1_7; \ 2928b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian stp2_7 = stp1_7; \ 2929b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2930b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ 2931b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ 2932b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_10, stp2_13) \ 2933b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2934b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_8 = stp1_8; \ 2935b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_15 = stp1_15; \ 2936b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_11 = stp1_11; \ 2937b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_12 = stp1_12; \ 2938b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2939b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ 2940b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ 2941b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ 2942b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ 2943b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ 2944b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ 2945b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ 2946b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ 2947b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 
2948b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ 2949b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ 2950b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ 2951b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ 2952b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ 2953b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ 2954b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ 2955b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ 2956b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \ 2957b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\ 2958b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage5 */ \ 2959b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \ 2960b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 2961b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 2962b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 2963b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 2964b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2965b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ 
2966b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ 2967b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 2968b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 2969b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2970b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 2971b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 2972b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2973b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_0 = stp2_0; \ 2974b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_1 = stp2_1; \ 2975b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_2 = stp2_1; \ 2976b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_3 = stp2_0; \ 2977b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2978b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 2979b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 2980b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 2981b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 2982b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2983b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); \ 2984b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_add_epi32(tmp1, rounding); 
\ 2985b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); \ 2986b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_add_epi32(tmp3, rounding); \ 2987b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2988b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 2989b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 2990b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 2991b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 2992b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2993b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 2994b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 2995b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2996b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_4 = stp2_4; \ 2997b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_7 = stp2_7; \ 2998b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 2999b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ 3000b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 3001b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 3002b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ 3003b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ 
3004b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 3005b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 3006b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ 3007b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3008b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_16 = stp2_16; \ 3009b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_17 = stp2_17; \ 3010b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3011b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ 3012b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ 3013b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_19, stp1_28) \ 3014b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ 3015b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ 3016b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_21, stp1_26) \ 3017b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3018b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_22 = stp2_22; \ 3019b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_23 = stp2_23; \ 3020b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_24 = stp2_24; \ 3021b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_25 = stp2_25; \ 3022b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_30 = stp2_30; \ 3023b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_31 = 
stp2_31; \ 3024b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \ 3025b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\ 3026b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage6 */ \ 3027b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \ 3028b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 3029b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 3030b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 3031b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 3032b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3033b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ 3034b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 3035b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 3036b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ 3037b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ 3038b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 3039b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 3040b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ 3041b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3042b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_8 = stp1_8; \ 
3043b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_9 = stp1_9; \ 3044b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_14 = stp1_14; \ 3045b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_15 = stp1_15; \ 3046b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3047b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 3048b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ 3049b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_13, stp2_11, stp2_12) \ 3050b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3051b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ 3052b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ 3053b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ 3054b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ 3055b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ 3056b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ 3057b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ 3058b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ 3059b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3060b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ 3061b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_25 = _mm_sub_epi16(stp1_30, 
stp1_25); \ 3062b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ 3063b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ 3064b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ 3065b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ 3066b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ 3067b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ 3068b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} \ 3069b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian\ 3070b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* Stage7 */ \ 3071b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian{ \ 3072b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 3073b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 3074b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 3075b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 3076b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3077b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 3078b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 3079b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_23_24 = 
_mm_unpacklo_epi16(stp2_23, stp2_24); \ 3080b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ 3081b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3082b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ 3083b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ 3084b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ 3085b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ 3086b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ 3087b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ 3088b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ 3089b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ 3090b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ 3091b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ 3092b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ 3093b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ 3094b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ 3095b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ 3096b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ 
3097b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ 3098b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3099b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_16 = stp2_16; \ 3100b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_17 = stp2_17; \ 3101b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_18 = stp2_18; \ 3102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_19 = stp2_19; \ 3103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3104b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ 3105b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ 3106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_21, stp1_26) \ 3107b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ 3108b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ 3109b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_23, stp1_24) \ 3110b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3111b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_28 = stp2_28; \ 3112b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_29 = stp2_29; \ 3113b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_30 = stp2_30; \ 3114b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian stp1_31 = stp2_31; \ 3115b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 3116b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 3117b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
3118b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define IDCT32 \ 31195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage1 */ \ 31205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \ 3121b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ 3122b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ 3123b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ 3124b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ 3125b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3126b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ 3127b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ 3128b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ 3129b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ 3130b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian \ 3131b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ 3132b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ 3133b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ 3134b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ 3135b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian \ 3136b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ 3137b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ 3138b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ 3139b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ 31405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 31415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ 31425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ 31435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_17, stp1_30) \ 31445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ 31455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ 31465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_19, stp1_28) \ 31475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ 31485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ 31495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_21, stp1_26) \ 31505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ 31515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ 31525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_23, stp1_24) \ 31535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \ 31545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\ 31555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage2 */ \ 31565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \ 
3157b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ 3158b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ 3159b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ 3160b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ 31615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 3162b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ 3163b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ 3164b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ 3165b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ 31665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 31675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ 31685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ 31695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_14) \ 31705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ 31715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ 31725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_11, stp2_12) \ 31735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 31745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ 31755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ 
31765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ 31775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ 31785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 31795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ 31805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ 31815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ 31825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ 31835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 31845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ 31855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ 31865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ 31875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ 31885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 31895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ 31905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ 31915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ 31925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ 31935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \ 31945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\ 31955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage3 */ \ 31965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \ 3197b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ 3198b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ 
3199b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ 3200b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ 32015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ 32035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ 32045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 32055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 32065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 32085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 32095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 32105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 32115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ 32135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ 32145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_6) \ 32155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ 32175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ 32185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ 
32195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ 32205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ 32215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ 32225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ 32235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ 32245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ 32265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ 32275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_18, stp1_29) \ 32285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ 32295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ 32305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_22, stp1_25) \ 32315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_16 = stp2_16; \ 32335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_31 = stp2_31; \ 32345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_19 = stp2_19; \ 32355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_20 = stp2_20; \ 32365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_23 = stp2_23; \ 32375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_24 = stp2_24; \ 32385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_27 = stp2_27; \ 32395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_28 = stp2_28; \ 32405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \ 32415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\ 32425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage4 */ \ 32435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \ 
3244b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ 3245b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ 3246b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ 3247b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ 32485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 32505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 32515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 32525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 32535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ 32555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ 32565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_2, stp2_3) \ 32575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ 32595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ 32605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ 32615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ 32625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ 
32645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ 32655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_10, stp2_13) \ 32665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_8 = stp1_8; \ 32685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_15 = stp1_15; \ 32695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_11 = stp1_11; \ 32705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_12 = stp1_12; \ 32715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ 32735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ 32745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ 32755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ 32765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ 32775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ 32785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ 32795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ 32805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ 32825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ 32835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ 32845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ 32855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ 32865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ 
32875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ 32885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ 32895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \ 32905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\ 32915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage5 */ \ 32925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \ 32935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 32945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 32955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 32965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 32975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 32985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ 32995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ 33005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 33015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 33025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 33045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 33055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ 33075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ 33085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_2 = _mm_sub_epi16(stp2_1, 
stp2_2); \ 33095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ 33105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 33125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 33135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 33145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 33155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp0 = _mm_add_epi32(tmp0, rounding); \ 33175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp1 = _mm_add_epi32(tmp1, rounding); \ 33185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp2 = _mm_add_epi32(tmp2, rounding); \ 33195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp3 = _mm_add_epi32(tmp3, rounding); \ 33205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 33225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 33235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 33245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 33255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 33275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 33285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_4 = stp2_4; \ 33305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_7 = stp2_7; \ 33315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ 
33335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 33345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 33355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ 33365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ 33375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 33385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 33395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ 33405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_16 = stp2_16; \ 33425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_17 = stp2_17; \ 33435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ 33455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ 33465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_19, stp1_28) \ 33475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ 33485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ 33495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_21, stp1_26) \ 33505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_22 = stp2_22; \ 33525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_23 = stp2_23; \ 33535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_24 = stp2_24; \ 33545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_25 = stp2_25; \ 33555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_30 = stp2_30; \ 33565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_31 = stp2_31; \ 
33575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \ 33585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\ 33595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage6 */ \ 33605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \ 33615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 33625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 33635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 33645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 33655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ 33675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 33685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 33695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ 33705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ 33715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 33725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 33735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ 33745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_8 = stp1_8; \ 33765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_9 = stp1_9; \ 33775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_14 = stp1_14; \ 33785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_15 = stp1_15; \ 33795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 
33815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ 33825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_13, stp2_11, stp2_12) \ 33835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ 33855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ 33865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ 33875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ 33885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ 33895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ 33905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ 33915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ 33925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 33935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ 33945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ 33955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ 33965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ 33975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ 33985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ 33995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ 34005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ 34015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} \ 34025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang\ 34035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* Stage7 */ \ 
34045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang{ \ 34055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 34065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 34075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 34085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 34095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 34105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 34115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 34125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ 34135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ 34145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 34155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ 34165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ 34175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ 34185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ 34195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ 34205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ 34215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ 34225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ 34235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ 34245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_9 = 
_mm_sub_epi16(stp2_6, stp2_9); \ 34255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ 34265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ 34275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ 34285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ 34295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ 34305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ 34315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 34325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_16 = stp2_16; \ 34335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_17 = stp2_17; \ 34345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_18 = stp2_18; \ 34355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_19 = stp2_19; \ 34365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 34375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ 34385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ 34395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_21, stp1_26) \ 34405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ 34415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ 34425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_23, stp1_24) \ 34435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang \ 34445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_28 = stp2_28; \ 34455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_29 = stp2_29; \ 34465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_30 = stp2_30; \ 34475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_31 = stp2_31; \ 34485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 
34495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 34505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang// Only upper-left 8x8 has non-zero coeff 34515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, 34525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int stride) { 34535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 34545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i final_rounding = _mm_set1_epi16(1<<5); 34555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 34565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // idct constants for each stage 34575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 34585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 34595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 34605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 34615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 34625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 34635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 34645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 34655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 34665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); 34675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); 
34685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); 34695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); 34705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); 34715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); 34725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); 34735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 34745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 34755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 34765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 34775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 34785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 34795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 34805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 34815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 34825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 34835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 34845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 34855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 
34865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 34875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 34885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 34895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 34905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 34915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 34925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 34935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 34945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 34955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 34965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 34975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 34985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 34995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 35005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 35015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 35025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 35035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3504b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in[32], col[32]; 
35055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 35065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 35075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 35085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 35095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp1_30, stp1_31; 35105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 35115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 35125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 35135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 35145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang stp2_30, stp2_31; 35155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3516b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int i; 3517b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Load input data. 
3518b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[0], input); 3519b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[8], input); 3520b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[16], input); 3521b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[24], input); 3522b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[1], input); 3523b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[9], input); 3524b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[17], input); 3525b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[25], input); 3526b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[2], input); 3527b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[10], input); 3528b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[18], input); 3529b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[26], input); 3530b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[3], input); 3531b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[11], input); 3532b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[19], input); 3533b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[27], input); 3534b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 3535b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[4], input); 3536b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[12], input); 3537b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[20], input); 
3538b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[28], input); 3539b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[5], input); 3540b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[13], input); 3541b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[21], input); 3542b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[29], input); 3543b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[6], input); 3544b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[14], input); 3545b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[22], input); 3546b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[30], input); 3547b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[7], input); 3548b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[15], input); 3549b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[23], input); 3550b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[31], input); 35515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3552b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(in, in); 3553b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(in+8, in+8); 3554b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(in+16, in+16); 3555b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(in+24, in+24); 3556b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 3557b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian IDCT32 3558b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
3559b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 1_D: Store 32 intermediate results for each 8x32 block. 3560b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[0] = _mm_add_epi16(stp1_0, stp1_31); 3561b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[1] = _mm_add_epi16(stp1_1, stp1_30); 3562b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[2] = _mm_add_epi16(stp1_2, stp1_29); 3563b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[3] = _mm_add_epi16(stp1_3, stp1_28); 3564b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[4] = _mm_add_epi16(stp1_4, stp1_27); 3565b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[5] = _mm_add_epi16(stp1_5, stp1_26); 3566b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[6] = _mm_add_epi16(stp1_6, stp1_25); 3567b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[7] = _mm_add_epi16(stp1_7, stp1_24); 3568b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[8] = _mm_add_epi16(stp1_8, stp1_23); 3569b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[9] = _mm_add_epi16(stp1_9, stp1_22); 3570b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[10] = _mm_add_epi16(stp1_10, stp1_21); 3571b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[11] = _mm_add_epi16(stp1_11, stp1_20); 3572b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[12] = _mm_add_epi16(stp1_12, stp1_19); 3573b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[13] = _mm_add_epi16(stp1_13, stp1_18); 3574b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[14] = _mm_add_epi16(stp1_14, stp1_17); 3575b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[15] = _mm_add_epi16(stp1_15, stp1_16); 3576b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian col[16] = _mm_sub_epi16(stp1_15, stp1_16); 3577b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[17] = _mm_sub_epi16(stp1_14, stp1_17); 3578b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[18] = _mm_sub_epi16(stp1_13, stp1_18); 3579b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[19] = _mm_sub_epi16(stp1_12, stp1_19); 3580b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[20] = _mm_sub_epi16(stp1_11, stp1_20); 3581b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[21] = _mm_sub_epi16(stp1_10, stp1_21); 3582b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[22] = _mm_sub_epi16(stp1_9, stp1_22); 3583b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[23] = _mm_sub_epi16(stp1_8, stp1_23); 3584b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[24] = _mm_sub_epi16(stp1_7, stp1_24); 3585b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[25] = _mm_sub_epi16(stp1_6, stp1_25); 3586b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[26] = _mm_sub_epi16(stp1_5, stp1_26); 3587b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[27] = _mm_sub_epi16(stp1_4, stp1_27); 3588b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[28] = _mm_sub_epi16(stp1_3, stp1_28); 3589b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[29] = _mm_sub_epi16(stp1_2, stp1_29); 3590b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[30] = _mm_sub_epi16(stp1_1, stp1_30); 3591b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian col[31] = _mm_sub_epi16(stp1_0, stp1_31); 3592b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (i = 0; i < 4; i++) { 35935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i zero = _mm_setzero_si128(); 
3594b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Transpose 32x8 block to 8x32 block 3595b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(col+i*8, in); 3596b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian IDCT32_34 35975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 35985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // 2_D: Calculate the results and store them to destination. 3599b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_add_epi16(stp1_0, stp1_31); 3600b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_add_epi16(stp1_1, stp1_30); 3601b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_add_epi16(stp1_2, stp1_29); 3602b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_add_epi16(stp1_3, stp1_28); 3603b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_add_epi16(stp1_4, stp1_27); 3604b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_add_epi16(stp1_5, stp1_26); 3605b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_add_epi16(stp1_6, stp1_25); 3606b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_add_epi16(stp1_7, stp1_24); 3607b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_add_epi16(stp1_8, stp1_23); 3608b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_add_epi16(stp1_9, stp1_22); 3609b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_add_epi16(stp1_10, stp1_21); 3610b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_add_epi16(stp1_11, stp1_20); 3611b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_add_epi16(stp1_12, stp1_19); 3612b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = 
_mm_add_epi16(stp1_13, stp1_18); 3613b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_add_epi16(stp1_14, stp1_17); 3614b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_add_epi16(stp1_15, stp1_16); 3615b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[16] = _mm_sub_epi16(stp1_15, stp1_16); 3616b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[17] = _mm_sub_epi16(stp1_14, stp1_17); 3617b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[18] = _mm_sub_epi16(stp1_13, stp1_18); 3618b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[19] = _mm_sub_epi16(stp1_12, stp1_19); 3619b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[20] = _mm_sub_epi16(stp1_11, stp1_20); 3620b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[21] = _mm_sub_epi16(stp1_10, stp1_21); 3621b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[22] = _mm_sub_epi16(stp1_9, stp1_22); 3622b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[23] = _mm_sub_epi16(stp1_8, stp1_23); 3623b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[24] = _mm_sub_epi16(stp1_7, stp1_24); 3624b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[25] = _mm_sub_epi16(stp1_6, stp1_25); 3625b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[26] = _mm_sub_epi16(stp1_5, stp1_26); 3626b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[27] = _mm_sub_epi16(stp1_4, stp1_27); 3627b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[28] = _mm_sub_epi16(stp1_3, stp1_28); 3628b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[29] = _mm_sub_epi16(stp1_2, stp1_29); 3629b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[30] = _mm_sub_epi16(stp1_1, stp1_30); 3630b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian in[31] = _mm_sub_epi16(stp1_0, stp1_31); 36315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 36325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // Final rounding and shift 3633b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_adds_epi16(in[0], final_rounding); 3634b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_adds_epi16(in[1], final_rounding); 3635b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_adds_epi16(in[2], final_rounding); 3636b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_adds_epi16(in[3], final_rounding); 3637b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_adds_epi16(in[4], final_rounding); 3638b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_adds_epi16(in[5], final_rounding); 3639b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_adds_epi16(in[6], final_rounding); 3640b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_adds_epi16(in[7], final_rounding); 3641b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_adds_epi16(in[8], final_rounding); 3642b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_adds_epi16(in[9], final_rounding); 3643b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_adds_epi16(in[10], final_rounding); 3644b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_adds_epi16(in[11], final_rounding); 3645b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_adds_epi16(in[12], final_rounding); 3646b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_adds_epi16(in[13], final_rounding); 3647b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_adds_epi16(in[14], final_rounding); 
3648b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_adds_epi16(in[15], final_rounding); 3649b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[16] = _mm_adds_epi16(in[16], final_rounding); 3650b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[17] = _mm_adds_epi16(in[17], final_rounding); 3651b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[18] = _mm_adds_epi16(in[18], final_rounding); 3652b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[19] = _mm_adds_epi16(in[19], final_rounding); 3653b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[20] = _mm_adds_epi16(in[20], final_rounding); 3654b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[21] = _mm_adds_epi16(in[21], final_rounding); 3655b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[22] = _mm_adds_epi16(in[22], final_rounding); 3656b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[23] = _mm_adds_epi16(in[23], final_rounding); 3657b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[24] = _mm_adds_epi16(in[24], final_rounding); 3658b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[25] = _mm_adds_epi16(in[25], final_rounding); 3659b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[26] = _mm_adds_epi16(in[26], final_rounding); 3660b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[27] = _mm_adds_epi16(in[27], final_rounding); 3661b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[28] = _mm_adds_epi16(in[28], final_rounding); 3662b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[29] = _mm_adds_epi16(in[29], final_rounding); 3663b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[30] = _mm_adds_epi16(in[30], final_rounding); 3664b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[31] = 
_mm_adds_epi16(in[31], final_rounding); 3665b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 3666b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_srai_epi16(in[0], 6); 3667b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_srai_epi16(in[1], 6); 3668b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_srai_epi16(in[2], 6); 3669b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_srai_epi16(in[3], 6); 3670b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_srai_epi16(in[4], 6); 3671b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_srai_epi16(in[5], 6); 3672b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_srai_epi16(in[6], 6); 3673b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_srai_epi16(in[7], 6); 3674b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_srai_epi16(in[8], 6); 3675b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_srai_epi16(in[9], 6); 3676b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_srai_epi16(in[10], 6); 3677b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_srai_epi16(in[11], 6); 3678b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_srai_epi16(in[12], 6); 3679b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_srai_epi16(in[13], 6); 3680b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_srai_epi16(in[14], 6); 3681b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_srai_epi16(in[15], 6); 3682b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[16] = _mm_srai_epi16(in[16], 6); 3683b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[17] = 
_mm_srai_epi16(in[17], 6); 3684b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[18] = _mm_srai_epi16(in[18], 6); 3685b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[19] = _mm_srai_epi16(in[19], 6); 3686b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[20] = _mm_srai_epi16(in[20], 6); 3687b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[21] = _mm_srai_epi16(in[21], 6); 3688b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[22] = _mm_srai_epi16(in[22], 6); 3689b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[23] = _mm_srai_epi16(in[23], 6); 3690b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[24] = _mm_srai_epi16(in[24], 6); 3691b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[25] = _mm_srai_epi16(in[25], 6); 3692b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[26] = _mm_srai_epi16(in[26], 6); 3693b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[27] = _mm_srai_epi16(in[27], 6); 3694b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[28] = _mm_srai_epi16(in[28], 6); 3695b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[29] = _mm_srai_epi16(in[29], 6); 3696b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[30] = _mm_srai_epi16(in[30], 6); 3697b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[31] = _mm_srai_epi16(in[31], 6); 3698b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 3699b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[0]); 3700b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[1]); 3701b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[2]); 3702b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[3]); 
3703b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[4]); 3704b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[5]); 3705b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[6]); 3706b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[7]); 3707b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[8]); 3708b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[9]); 3709b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[10]); 3710b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[11]); 3711b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[12]); 3712b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[13]); 3713b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[14]); 3714b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[15]); 3715b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[16]); 3716b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[17]); 3717b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[18]); 3718b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[19]); 3719b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[20]); 3720b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[21]); 3721b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[22]); 3722b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian RECON_AND_STORE(dest, in[23]); 3723b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[24]); 3724b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[25]); 3725b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[26]); 3726b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[27]); 3727b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[28]); 3728b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[29]); 3729b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[30]); 3730b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[31]); 37315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 37325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dest += 8 - (stride * 32); 37335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 37345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 37355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 37365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, 37375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int stride) { 3738ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3739ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i final_rounding = _mm_set1_epi16(1<<5); 3740ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3741ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // idct constants for each stage 3742ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3743ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3744ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_2 = 
pair_set_epi16(cospi_15_64, -cospi_17_64); 3745ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 3746ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 3747ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 3748ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 3749ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 3750ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 3751ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); 3752ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); 3753ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); 3754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); 3755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); 3756ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); 3757ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); 3758ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3759ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 3760ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 3761ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 
3762ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 3763ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 3764ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 3765ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 3766ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 3767ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3768ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 3769ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 3770ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 3771ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 3772ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 3773ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 3774ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 3775ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 3776ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 3777ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 3778ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3779ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 
3780ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 3781ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 3782ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 3783ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 3784ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 3785ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 3786ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3787ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 3788ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3789b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in[32], col[128], zero_idx[16]; 3790ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 3791ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 3792ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 3793ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 3794ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_30, stp1_31; 3795ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 3796ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 3797ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 3798ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_23, stp2_24, 
stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 3799ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_30, stp2_31; 3800ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3801f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang int i, j, i32; 3802f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang int zero_flag[2]; 3803ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3804b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (i = 0; i < 4; i++) { 3805f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang i32 = (i << 5); 3806ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // First 1-D idct 3807ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Load input data. 3808b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[0], input); 3809b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[8], input); 3810b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[16], input); 3811b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[24], input); 3812b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[1], input); 3813b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[9], input); 3814b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[17], input); 3815b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[25], input); 3816b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[2], input); 3817b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[10], input); 3818b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[18], input); 3819b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[26], input); 3820b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[3], 
input); 3821b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[11], input); 3822b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[19], input); 3823b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[27], input); 3824b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 3825b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[4], input); 3826b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[12], input); 3827b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[20], input); 3828b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[28], input); 3829b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[5], input); 3830b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[13], input); 3831b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[21], input); 3832b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[29], input); 3833b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[6], input); 3834b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[14], input); 3835b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[22], input); 3836b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[30], input); 3837b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[7], input); 3838b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[15], input); 3839b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[23], input); 3840b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian LOAD_DQCOEFF(in[31], input); 
3841f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 3842f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang // checking if all entries are zero 3843b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[0] = _mm_or_si128(in[0], in[1]); 3844b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[1] = _mm_or_si128(in[2], in[3]); 3845b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[2] = _mm_or_si128(in[4], in[5]); 3846b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[3] = _mm_or_si128(in[6], in[7]); 3847b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[4] = _mm_or_si128(in[8], in[9]); 3848b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[5] = _mm_or_si128(in[10], in[11]); 3849b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[6] = _mm_or_si128(in[12], in[13]); 3850b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[7] = _mm_or_si128(in[14], in[15]); 3851b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[8] = _mm_or_si128(in[16], in[17]); 3852b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[9] = _mm_or_si128(in[18], in[19]); 3853b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[10] = _mm_or_si128(in[20], in[21]); 3854b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[11] = _mm_or_si128(in[22], in[23]); 3855b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[12] = _mm_or_si128(in[24], in[25]); 3856b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[13] = _mm_or_si128(in[26], in[27]); 3857b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[14] = _mm_or_si128(in[28], in[29]); 3858b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian zero_idx[15] = _mm_or_si128(in[30], in[31]); 
3859f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 3860f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); 3861f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); 3862f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); 3863f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); 3864f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); 3865f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); 3866f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); 3867f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); 3868f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 3869f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); 3870f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); 3871f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); 3872f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); 3873f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); 3874f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); 3875f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); 3876f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 3877f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]); 3878f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]); 
3879f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32); 3880f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]); 3881f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]); 3882f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang 3883f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang if (!zero_flag[0] && !zero_flag[1]) { 3884f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 0] = _mm_setzero_si128(); 3885f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 1] = _mm_setzero_si128(); 3886f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 2] = _mm_setzero_si128(); 3887f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 3] = _mm_setzero_si128(); 3888f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 4] = _mm_setzero_si128(); 3889f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 5] = _mm_setzero_si128(); 3890f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 6] = _mm_setzero_si128(); 3891f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 7] = _mm_setzero_si128(); 3892f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 8] = _mm_setzero_si128(); 3893f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 9] = _mm_setzero_si128(); 3894f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 10] = _mm_setzero_si128(); 3895f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 11] = _mm_setzero_si128(); 3896f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 12] = _mm_setzero_si128(); 3897f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 13] = _mm_setzero_si128(); 3898f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 14] = _mm_setzero_si128(); 3899f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 15] = _mm_setzero_si128(); 3900f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 16] = _mm_setzero_si128(); 3901f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 17] = 
_mm_setzero_si128(); 3902f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 18] = _mm_setzero_si128(); 3903f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 19] = _mm_setzero_si128(); 3904f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 20] = _mm_setzero_si128(); 3905f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 21] = _mm_setzero_si128(); 3906f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 22] = _mm_setzero_si128(); 3907f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 23] = _mm_setzero_si128(); 3908f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 24] = _mm_setzero_si128(); 3909f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 25] = _mm_setzero_si128(); 3910f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 26] = _mm_setzero_si128(); 3911f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 27] = _mm_setzero_si128(); 3912f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 28] = _mm_setzero_si128(); 3913f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 29] = _mm_setzero_si128(); 3914f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 30] = _mm_setzero_si128(); 3915f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 31] = _mm_setzero_si128(); 3916f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang continue; 3917f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang } 3918ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3919ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Transpose 32x8 block to 8x32 block 3920b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(in, in); 3921b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(in+8, in+8); 3922b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(in+16, in+16); 3923b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(in+24, in+24); 3924ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
3925b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian IDCT32 3926ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3927ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 1_D: Store 32 intermediate results for each 8x32 block. 3928f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); 3929f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); 3930f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); 3931f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); 3932f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); 3933f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); 3934f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); 3935f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); 3936f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); 3937f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); 3938f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); 3939f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); 3940f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); 3941f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); 3942f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); 3943f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); 3944f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); 3945f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 17] = 
_mm_sub_epi16(stp1_14, stp1_17); 3946f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); 3947f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); 3948f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); 3949f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); 3950f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); 3951f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); 3952f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); 3953f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); 3954f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); 3955f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); 3956f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); 3957f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); 3958f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); 3959f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); 3960b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 3961b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (i = 0; i < 4; i++) { 3962ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 3963b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Second 1-D idct 3964b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian j = i << 3; 3965b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 3966b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian // Transpose 32x8 block to 8x32 block 3967b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(col+j, in); 3968b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(col+j+32, in+8); 3969b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(col+j+64, in+16); 3970b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8(col+j+96, in+24); 3971b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 3972b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian IDCT32 3973ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3974ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 2_D: Calculate the results and store them to destination. 3975b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_add_epi16(stp1_0, stp1_31); 3976b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_add_epi16(stp1_1, stp1_30); 3977b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_add_epi16(stp1_2, stp1_29); 3978b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_add_epi16(stp1_3, stp1_28); 3979b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_add_epi16(stp1_4, stp1_27); 3980b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_add_epi16(stp1_5, stp1_26); 3981b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_add_epi16(stp1_6, stp1_25); 3982b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_add_epi16(stp1_7, stp1_24); 3983b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_add_epi16(stp1_8, stp1_23); 3984b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_add_epi16(stp1_9, stp1_22); 3985b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = 
_mm_add_epi16(stp1_10, stp1_21); 3986b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_add_epi16(stp1_11, stp1_20); 3987b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_add_epi16(stp1_12, stp1_19); 3988b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_add_epi16(stp1_13, stp1_18); 3989b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_add_epi16(stp1_14, stp1_17); 3990b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_add_epi16(stp1_15, stp1_16); 3991b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[16] = _mm_sub_epi16(stp1_15, stp1_16); 3992b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[17] = _mm_sub_epi16(stp1_14, stp1_17); 3993b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[18] = _mm_sub_epi16(stp1_13, stp1_18); 3994b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[19] = _mm_sub_epi16(stp1_12, stp1_19); 3995b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[20] = _mm_sub_epi16(stp1_11, stp1_20); 3996b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[21] = _mm_sub_epi16(stp1_10, stp1_21); 3997b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[22] = _mm_sub_epi16(stp1_9, stp1_22); 3998b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[23] = _mm_sub_epi16(stp1_8, stp1_23); 3999b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[24] = _mm_sub_epi16(stp1_7, stp1_24); 4000b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[25] = _mm_sub_epi16(stp1_6, stp1_25); 4001b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[26] = _mm_sub_epi16(stp1_5, stp1_26); 4002b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[27] = _mm_sub_epi16(stp1_4, stp1_27); 4003b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian in[28] = _mm_sub_epi16(stp1_3, stp1_28); 4004b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[29] = _mm_sub_epi16(stp1_2, stp1_29); 4005b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[30] = _mm_sub_epi16(stp1_1, stp1_30); 4006b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[31] = _mm_sub_epi16(stp1_0, stp1_31); 4007ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 4008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final rounding and shift 4009b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_adds_epi16(in[0], final_rounding); 4010b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_adds_epi16(in[1], final_rounding); 4011b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_adds_epi16(in[2], final_rounding); 4012b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_adds_epi16(in[3], final_rounding); 4013b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_adds_epi16(in[4], final_rounding); 4014b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_adds_epi16(in[5], final_rounding); 4015b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_adds_epi16(in[6], final_rounding); 4016b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_adds_epi16(in[7], final_rounding); 4017b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_adds_epi16(in[8], final_rounding); 4018b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_adds_epi16(in[9], final_rounding); 4019b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_adds_epi16(in[10], final_rounding); 4020b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_adds_epi16(in[11], final_rounding); 4021b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian in[12] = _mm_adds_epi16(in[12], final_rounding); 4022b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_adds_epi16(in[13], final_rounding); 4023b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_adds_epi16(in[14], final_rounding); 4024b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_adds_epi16(in[15], final_rounding); 4025b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[16] = _mm_adds_epi16(in[16], final_rounding); 4026b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[17] = _mm_adds_epi16(in[17], final_rounding); 4027b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[18] = _mm_adds_epi16(in[18], final_rounding); 4028b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[19] = _mm_adds_epi16(in[19], final_rounding); 4029b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[20] = _mm_adds_epi16(in[20], final_rounding); 4030b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[21] = _mm_adds_epi16(in[21], final_rounding); 4031b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[22] = _mm_adds_epi16(in[22], final_rounding); 4032b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[23] = _mm_adds_epi16(in[23], final_rounding); 4033b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[24] = _mm_adds_epi16(in[24], final_rounding); 4034b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[25] = _mm_adds_epi16(in[25], final_rounding); 4035b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[26] = _mm_adds_epi16(in[26], final_rounding); 4036b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[27] = _mm_adds_epi16(in[27], final_rounding); 4037b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[28] = _mm_adds_epi16(in[28], final_rounding); 
4038b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[29] = _mm_adds_epi16(in[29], final_rounding); 4039b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[30] = _mm_adds_epi16(in[30], final_rounding); 4040b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[31] = _mm_adds_epi16(in[31], final_rounding); 4041b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 4042b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_srai_epi16(in[0], 6); 4043b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_srai_epi16(in[1], 6); 4044b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_srai_epi16(in[2], 6); 4045b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_srai_epi16(in[3], 6); 4046b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_srai_epi16(in[4], 6); 4047b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_srai_epi16(in[5], 6); 4048b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_srai_epi16(in[6], 6); 4049b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_srai_epi16(in[7], 6); 4050b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_srai_epi16(in[8], 6); 4051b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_srai_epi16(in[9], 6); 4052b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_srai_epi16(in[10], 6); 4053b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_srai_epi16(in[11], 6); 4054b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_srai_epi16(in[12], 6); 4055b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_srai_epi16(in[13], 6); 4056b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = 
_mm_srai_epi16(in[14], 6); 4057b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_srai_epi16(in[15], 6); 4058b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[16] = _mm_srai_epi16(in[16], 6); 4059b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[17] = _mm_srai_epi16(in[17], 6); 4060b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[18] = _mm_srai_epi16(in[18], 6); 4061b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[19] = _mm_srai_epi16(in[19], 6); 4062b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[20] = _mm_srai_epi16(in[20], 6); 4063b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[21] = _mm_srai_epi16(in[21], 6); 4064b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[22] = _mm_srai_epi16(in[22], 6); 4065b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[23] = _mm_srai_epi16(in[23], 6); 4066b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[24] = _mm_srai_epi16(in[24], 6); 4067b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[25] = _mm_srai_epi16(in[25], 6); 4068b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[26] = _mm_srai_epi16(in[26], 6); 4069b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[27] = _mm_srai_epi16(in[27], 6); 4070b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[28] = _mm_srai_epi16(in[28], 6); 4071b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[29] = _mm_srai_epi16(in[29], 6); 4072b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[30] = _mm_srai_epi16(in[30], 6); 4073b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[31] = _mm_srai_epi16(in[31], 6); 4074b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 4075b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
RECON_AND_STORE(dest, in[0]); 4076b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[1]); 4077b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[2]); 4078b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[3]); 4079b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[4]); 4080b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[5]); 4081b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[6]); 4082b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[7]); 4083b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[8]); 4084b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[9]); 4085b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[10]); 4086b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[11]); 4087b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[12]); 4088b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[13]); 4089b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[14]); 4090b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[15]); 4091b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[16]); 4092b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[17]); 4093b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[18]); 4094b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[19]); 
4095b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[20]); 4096b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[21]); 4097b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[22]); 4098b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[23]); 4099b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[24]); 4100b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[25]); 4101b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[26]); 4102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[27]); 4103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[28]); 4104b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[29]); 4105b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[30]); 4106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian RECON_AND_STORE(dest, in[31]); 4107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 4108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang dest += 8 - (stride * 32); 4109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 41105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} //NOLINT 41115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 41125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 41135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __m128i dc_value; 41145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const __m128i zero = _mm_setzero_si128(); 41155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int a, i; 41165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 41175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang a = 
dct_const_round_shift(input[0] * cospi_16_64); 41185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang a = dct_const_round_shift(a * cospi_16_64); 41195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang a = ROUND_POWER_OF_TWO(a, 6); 41205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 41215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dc_value = _mm_set1_epi16(a); 41225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 41235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (i = 0; i < 4; ++i) { 41245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 
41425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang RECON_AND_STORE(dest, dc_value); 41565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dest += 8 - (stride * 32); 41575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 4158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 4159