/* vp9_idct_intrin_sse2.c revision 91037db265ecdd914a26e056cf69207b4f50924e */
1ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang/* 2ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * 4ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * Use of this source code is governed by a BSD-style license 5ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * that can be found in the LICENSE file in the root of the source 6ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * tree. An additional intellectual property rights grant can be found 7ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * in the file PATENTS. All contributing project authors may 8ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * be found in the AUTHORS file in the root of the source tree. 9ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang */ 10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <assert.h> 12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <emmintrin.h> // SSE2 13ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "./vpx_config.h" 14ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vpx/vpx_integer.h" 15ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_common.h" 16ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_idct.h" 17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 18ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { 19ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 20ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i eight = _mm_set1_epi16(8); 21ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, 22ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang (int16_t)cospi_16_64, (int16_t)-cospi_16_64, 23ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang (int16_t)cospi_24_64, 
(int16_t)-cospi_8_64, 24ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang (int16_t)cospi_8_64, (int16_t)cospi_24_64); 25ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 26ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i input0, input1, input2, input3; 27ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 28ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Rows 29ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_loadl_epi64((__m128i *)input); 30ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_loadl_epi64((__m128i *)(input + 4)); 31ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_loadl_epi64((__m128i *)(input + 8)); 32ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_loadl_epi64((__m128i *)(input + 12)); 33ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 34ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Construct i3, i1, i3, i1, i2, i0, i2, i0 35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_shufflelo_epi16(input0, 0xd8); 36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_shufflelo_epi16(input1, 0xd8); 37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_shufflelo_epi16(input2, 0xd8); 38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_shufflelo_epi16(input3, 0xd8); 39ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 40ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_unpacklo_epi32(input0, input0); 41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_unpacklo_epi32(input1, input1); 42ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_unpacklo_epi32(input2, input2); 43ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_unpacklo_epi32(input3, input3); 44ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 45ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage 1 46ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_madd_epi16(input0, cst); 47ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
input1 = _mm_madd_epi16(input1, cst); 48ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_madd_epi16(input2, cst); 49ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_madd_epi16(input3, cst); 50ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 51ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_add_epi32(input0, rounding); 52ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_add_epi32(input1, rounding); 53ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_add_epi32(input2, rounding); 54ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_add_epi32(input3, rounding); 55ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 56ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); 57ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); 58ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); 59ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); 60ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 61ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage 2 62ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_packs_epi32(input0, zero); 63ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_packs_epi32(input1, zero); 64ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_packs_epi32(input2, zero); 65ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_packs_epi32(input3, zero); 66ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 67ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Transpose 68ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_unpacklo_epi16(input0, input1); 69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_unpacklo_epi16(input2, input3); 70ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_unpacklo_epi32(input1, input3); 71ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = 
_mm_unpackhi_epi32(input1, input3); 72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 73ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Switch column2, column 3, and then, we got: 74ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // input2: column1, column 0; input3: column2, column 3. 75ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_shuffle_epi32(input1, 0x4e); 76ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_add_epi16(input0, input1); 77ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_sub_epi16(input0, input1); 78ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 79ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Columns 80ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Construct i3, i1, i3, i1, i2, i0, i2, i0 81ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_shufflelo_epi16(input2, 0xd8); 82ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_shufflehi_epi16(input2, 0xd8); 83ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_shufflehi_epi16(input3, 0xd8); 84ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_shufflelo_epi16(input3, 0xd8); 85ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 86ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_unpacklo_epi32(input0, input0); 87ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_unpackhi_epi32(input1, input1); 88ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_unpackhi_epi32(input2, input2); 89ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_unpacklo_epi32(input3, input3); 90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 91ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage 1 92ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_madd_epi16(input0, cst); 93ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_madd_epi16(input1, cst); 94ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_madd_epi16(input2, cst); 95ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = 
_mm_madd_epi16(input3, cst); 96ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 97ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_add_epi32(input0, rounding); 98ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_add_epi32(input1, rounding); 99ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_add_epi32(input2, rounding); 100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_add_epi32(input3, rounding); 101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); 103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); 104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); 105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); 106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage 2 108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_packs_epi32(input0, zero); 109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_packs_epi32(input1, zero); 110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_packs_epi32(input2, zero); 111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_packs_epi32(input3, zero); 112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Transpose 114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_unpacklo_epi16(input0, input1); 115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_unpacklo_epi16(input2, input3); 116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_unpacklo_epi32(input1, input3); 117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_unpackhi_epi32(input1, input3); 118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Switch column2, column 3, and then, we got: 
120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // input2: column1, column 0; input3: column2, column 3. 121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_shuffle_epi32(input1, 0x4e); 122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_add_epi16(input0, input1); 123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_sub_epi16(input0, input1); 124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final round and shift 126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_add_epi16(input2, eight); 127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_add_epi16(input3, eight); 128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input2 = _mm_srai_epi16(input2, 4); 130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input3 = _mm_srai_epi16(input3, 4); 131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define RECON_AND_STORE4X4(dest, in_x) \ 133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ 135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang d0 = _mm_unpacklo_epi8(d0, zero); \ 136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang d0 = _mm_add_epi16(in_x, d0); \ 137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang d0 = _mm_packus_epi16(d0, d0); \ 138ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang *(int *)dest = _mm_cvtsi128_si32(d0); \ 139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang dest += stride; \ 140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input0 = _mm_srli_si128(input2, 8); 143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input1 = _mm_srli_si128(input3, 8); 144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
RECON_AND_STORE4X4(dest, input2); 146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE4X4(dest, input0); 147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE4X4(dest, input1); 148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE4X4(dest, input3); 149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 15191037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { 15291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i dc_value; 15391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i zero = _mm_setzero_si128(); 15491037db265ecdd914a26e056cf69207b4f50924ehkuang int a; 15591037db265ecdd914a26e056cf69207b4f50924ehkuang 15691037db265ecdd914a26e056cf69207b4f50924ehkuang a = dct_const_round_shift(input[0] * cospi_16_64); 15791037db265ecdd914a26e056cf69207b4f50924ehkuang a = dct_const_round_shift(a * cospi_16_64); 15891037db265ecdd914a26e056cf69207b4f50924ehkuang a = ROUND_POWER_OF_TWO(a, 4); 15991037db265ecdd914a26e056cf69207b4f50924ehkuang 16091037db265ecdd914a26e056cf69207b4f50924ehkuang dc_value = _mm_set1_epi16(a); 16191037db265ecdd914a26e056cf69207b4f50924ehkuang 16291037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, dc_value); 16391037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, dc_value); 16491037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, dc_value); 16591037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, dc_value); 16691037db265ecdd914a26e056cf69207b4f50924ehkuang} 16791037db265ecdd914a26e056cf69207b4f50924ehkuang 168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { 169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, 
(int16_t)cospi_16_64, 171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang (int16_t)cospi_16_64, (int16_t)-cospi_16_64, 172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang (int16_t)cospi_24_64, (int16_t)-cospi_8_64, 173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang (int16_t)cospi_8_64, (int16_t)cospi_24_64); 174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1); 175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i in, temp; 178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Load input data. 180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = _mm_loadl_epi64((__m128i *)input); 181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Construct i3, i1, i3, i1, i2, i0, i2, i0 183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = _mm_shufflelo_epi16(in, 0xd8); 184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = _mm_unpacklo_epi32(in, in); 185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage 1 187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = _mm_madd_epi16(in, c1); 188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = _mm_add_epi32(in, rounding); 189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = _mm_srai_epi32(in, DCT_CONST_BITS); 190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = _mm_packs_epi32(in, zero); 191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage 2 193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang temp = _mm_shufflelo_epi16(in, 0x9c); 194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = _mm_shufflelo_epi16(in, 0xc9); 195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = _mm_unpacklo_epi64(temp, in); 
196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = _mm_madd_epi16(in, c2); 197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in = _mm_packs_epi32(in, zero); 198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Store results 200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storel_epi64((__m128i *)output, in); 201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 20391037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void transpose_4x4(__m128i *res) { 20491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 20591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); 20691037db265ecdd914a26e056cf69207b4f50924ehkuang res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); 20791037db265ecdd914a26e056cf69207b4f50924ehkuang res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); 20891037db265ecdd914a26e056cf69207b4f50924ehkuang 20991037db265ecdd914a26e056cf69207b4f50924ehkuang res[1] = _mm_unpackhi_epi64(res[0], res[0]); 21091037db265ecdd914a26e056cf69207b4f50924ehkuang res[3] = _mm_unpackhi_epi64(res[2], res[2]); 21191037db265ecdd914a26e056cf69207b4f50924ehkuang} 21291037db265ecdd914a26e056cf69207b4f50924ehkuang 21391037db265ecdd914a26e056cf69207b4f50924ehkuangvoid idct4_1d_sse2(__m128i *in) { 21491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); 21591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 21691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 21791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 21891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = 
_mm_set1_epi32(DCT_CONST_ROUNDING); 21991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i u[8], v[8]; 22091037db265ecdd914a26e056cf69207b4f50924ehkuang 22191037db265ecdd914a26e056cf69207b4f50924ehkuang transpose_4x4(in); 22291037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 1 22391037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(in[0], in[2]); 22491037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpacklo_epi16(in[1], in[3]); 22591037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 22691037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 22791037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 22891037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 22991037db265ecdd914a26e056cf69207b4f50924ehkuang 23091037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 23191037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 23291037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 23391037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 23491037db265ecdd914a26e056cf69207b4f50924ehkuang 23591037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 23691037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 23791037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 23891037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 23991037db265ecdd914a26e056cf69207b4f50924ehkuang 24091037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_packs_epi32(v[0], v[2]); 24191037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_packs_epi32(v[1], v[3]); 
24291037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpackhi_epi64(u[0], u[0]); 24391037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi64(u[1], u[1]); 24491037db265ecdd914a26e056cf69207b4f50924ehkuang 24591037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 24691037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_add_epi16(u[0], u[3]); 24791037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_add_epi16(u[1], u[2]); 24891037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_sub_epi16(u[1], u[2]); 24991037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_sub_epi16(u[0], u[3]); 25091037db265ecdd914a26e056cf69207b4f50924ehkuang} 25191037db265ecdd914a26e056cf69207b4f50924ehkuang 25291037db265ecdd914a26e056cf69207b4f50924ehkuangvoid iadst4_1d_sse2(__m128i *in) { 25391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); 25491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); 25591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); 25691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); 25791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); 25891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i kZero = _mm_set1_epi16(0); 25991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 26091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i u[8], v[8], in7; 26191037db265ecdd914a26e056cf69207b4f50924ehkuang 26291037db265ecdd914a26e056cf69207b4f50924ehkuang transpose_4x4(in); 26391037db265ecdd914a26e056cf69207b4f50924ehkuang in7 = _mm_add_epi16(in[0], in[3]); 26491037db265ecdd914a26e056cf69207b4f50924ehkuang in7 = _mm_sub_epi16(in7, in[2]); 
26591037db265ecdd914a26e056cf69207b4f50924ehkuang 26691037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(in[0], in[2]); 26791037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpacklo_epi16(in[1], in[3]); 26891037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(in7, kZero); 26991037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpacklo_epi16(in[1], kZero); 27091037db265ecdd914a26e056cf69207b4f50924ehkuang 27191037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 27291037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 27391037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 27491037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 27591037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 27691037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 27791037db265ecdd914a26e056cf69207b4f50924ehkuang 27891037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[1]); 27991037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[3], v[4]); 28091037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = v[2]; 28191037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(u[0], u[1]); 28291037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_slli_epi32(v[5], 2); 28391037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(u[3], v[5]); 28491037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_sub_epi32(u[5], u[4]); 28591037db265ecdd914a26e056cf69207b4f50924ehkuang 28691037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 28791037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 
28891037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 28991037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 29091037db265ecdd914a26e056cf69207b4f50924ehkuang 29191037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 29291037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 29391037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 29491037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 29591037db265ecdd914a26e056cf69207b4f50924ehkuang 29691037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_packs_epi32(u[0], u[2]); 29791037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_packs_epi32(u[1], u[3]); 29891037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_unpackhi_epi64(in[0], in[0]); 29991037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_unpackhi_epi64(in[1], in[1]); 30091037db265ecdd914a26e056cf69207b4f50924ehkuang} 30191037db265ecdd914a26e056cf69207b4f50924ehkuang 30291037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride, 30391037db265ecdd914a26e056cf69207b4f50924ehkuang int tx_type) { 30491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in[4]; 30591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i zero = _mm_setzero_si128(); 30691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i eight = _mm_set1_epi16(8); 30791037db265ecdd914a26e056cf69207b4f50924ehkuang 30891037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_loadl_epi64((__m128i *)input); 30991037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_loadl_epi64((__m128i *)(input + 4)); 31091037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_loadl_epi64((__m128i *)(input + 8)); 31191037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = 
_mm_loadl_epi64((__m128i *)(input + 12)); 31291037db265ecdd914a26e056cf69207b4f50924ehkuang 31391037db265ecdd914a26e056cf69207b4f50924ehkuang switch (tx_type) { 31491037db265ecdd914a26e056cf69207b4f50924ehkuang case 0: // DCT_DCT 31591037db265ecdd914a26e056cf69207b4f50924ehkuang idct4_1d_sse2(in); 31691037db265ecdd914a26e056cf69207b4f50924ehkuang idct4_1d_sse2(in); 31791037db265ecdd914a26e056cf69207b4f50924ehkuang break; 31891037db265ecdd914a26e056cf69207b4f50924ehkuang case 1: // ADST_DCT 31991037db265ecdd914a26e056cf69207b4f50924ehkuang idct4_1d_sse2(in); 32091037db265ecdd914a26e056cf69207b4f50924ehkuang iadst4_1d_sse2(in); 32191037db265ecdd914a26e056cf69207b4f50924ehkuang break; 32291037db265ecdd914a26e056cf69207b4f50924ehkuang case 2: // DCT_ADST 32391037db265ecdd914a26e056cf69207b4f50924ehkuang iadst4_1d_sse2(in); 32491037db265ecdd914a26e056cf69207b4f50924ehkuang idct4_1d_sse2(in); 32591037db265ecdd914a26e056cf69207b4f50924ehkuang break; 32691037db265ecdd914a26e056cf69207b4f50924ehkuang case 3: // ADST_ADST 32791037db265ecdd914a26e056cf69207b4f50924ehkuang iadst4_1d_sse2(in); 32891037db265ecdd914a26e056cf69207b4f50924ehkuang iadst4_1d_sse2(in); 32991037db265ecdd914a26e056cf69207b4f50924ehkuang break; 33091037db265ecdd914a26e056cf69207b4f50924ehkuang default: 33191037db265ecdd914a26e056cf69207b4f50924ehkuang assert(0); 33291037db265ecdd914a26e056cf69207b4f50924ehkuang break; 33391037db265ecdd914a26e056cf69207b4f50924ehkuang } 33491037db265ecdd914a26e056cf69207b4f50924ehkuang 33591037db265ecdd914a26e056cf69207b4f50924ehkuang // Final round and shift 33691037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_add_epi16(in[0], eight); 33791037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_add_epi16(in[1], eight); 33891037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_add_epi16(in[2], eight); 33991037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_add_epi16(in[3], eight); 34091037db265ecdd914a26e056cf69207b4f50924ehkuang 
34191037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_srai_epi16(in[0], 4); 34291037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_srai_epi16(in[1], 4); 34391037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_srai_epi16(in[2], 4); 34491037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_srai_epi16(in[3], 4); 34591037db265ecdd914a26e056cf69207b4f50924ehkuang 34691037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, in[0]); 34791037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, in[1]); 34891037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, in[2]); 34991037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE4X4(dest, in[3]); 35091037db265ecdd914a26e056cf69207b4f50924ehkuang} 35191037db265ecdd914a26e056cf69207b4f50924ehkuang 352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ 353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out0, out1, out2, out3, out4, out5, out6, out7) \ 354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ 358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ 359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ 360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ 361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ 362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ 363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 
364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ 365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ 366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ 367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ 368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ 369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ 370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ 371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ 372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ 374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ 375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ 376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ 377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ 378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ 379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ 380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ 381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \ 384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out0, out1, out2, out3, out4, out5, out6, out7) \ 
385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ 389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ 390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ 392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ 393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ 394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ 395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ 397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ 398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ 399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ 400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang out4 = out5 = out6 = out7 = zero; \ 401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \ 404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ 408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ 409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ 411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ 412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \ 413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \ 414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang// Define Macro for multiplying elements by constants and adding them together. 417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ 418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ 419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_0, cst0); \ 421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_madd_epi16(hi_0, cst0); \ 422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_0, cst1); \ 423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_madd_epi16(hi_0, cst1); \ 424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_madd_epi16(lo_1, cst2); \ 425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_madd_epi16(hi_1, cst2); \ 426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_madd_epi16(lo_1, cst3); \ 427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_madd_epi16(hi_1, cst3); \ 428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); \ 
430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_add_epi32(tmp1, rounding); \ 431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); \ 432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_add_epi32(tmp3, rounding); \ 433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_add_epi32(tmp4, rounding); \ 434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_add_epi32(tmp5, rounding); \ 435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_add_epi32(tmp6, rounding); \ 436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_add_epi32(tmp7, rounding); \ 437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ 443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ 444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ 445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ 446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res0 = _mm_packs_epi32(tmp0, tmp1); \ 448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res1 = _mm_packs_epi32(tmp2, tmp3); \ 449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res2 = _mm_packs_epi32(tmp4, tmp5); \ 450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang res3 = _mm_packs_epi32(tmp6, tmp7); \ 451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define IDCT8x8_1D \ 454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage1 */ \ 455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ 457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ 458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ 459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ 460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ 462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg1_1, stg1_2, stg1_3, stp1_4, \ 463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_7, stp1_5, stp1_6) \ 464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage2 */ \ 467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ 469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ 470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ 471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ 472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ 474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg2_1, stg2_2, stg2_3, stp2_0, \ 475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_1, stp2_2, stp2_3) \ 476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ 478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ 479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ 480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ 481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage3 */ \ 484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ 489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ 490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ 491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ 492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ 494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ 495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ 496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ 497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); \ 499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_add_epi32(tmp1, rounding); \ 500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); \ 501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = 
_mm_add_epi32(tmp3, rounding); \ 502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage4 */ \ 513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_adds_epi16(stp1_0, stp2_7); \ 514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_adds_epi16(stp1_1, stp1_6); \ 515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_adds_epi16(stp1_2, stp1_5); \ 516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_adds_epi16(stp1_3, stp2_4); \ 517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_subs_epi16(stp1_3, stp2_4); \ 518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_subs_epi16(stp1_2, stp1_5); \ 519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_subs_epi16(stp1_1, stp1_6); \ 520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_subs_epi16(stp1_0, stp2_7); 521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define RECON_AND_STORE(dest, in_x) \ 523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ 525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang d0 = _mm_unpacklo_epi8(d0, zero); \ 
526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in_x = _mm_add_epi16(in_x, d0); \ 527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in_x = _mm_packus_epi16(in_x, in_x); \ 528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_storel_epi64((__m128i *)(dest), in_x); \ 529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang dest += stride; \ 530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { 533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i final_rounding = _mm_set1_epi16(1<<4); 536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i in0, in1, in2, in3, in4, in5, in6, in7; 546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i 
stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int i; 550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Load input data. 552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_load_si128((__m128i *)input); 553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); 554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); 555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); 556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_load_si128((__m128i *)(input + 8 * 4)); 557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_load_si128((__m128i *)(input + 8 * 5)); 558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_load_si128((__m128i *)(input + 8 * 6)); 559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_load_si128((__m128i *)(input + 8 * 7)); 560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 2-D 562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang for (i = 0; i < 2; i++) { 563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() 564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4, in5, in6, in7); 566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 4-stage 1D idct8x8 568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang IDCT8x8_1D 569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 
570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final rounding and shift 572ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_adds_epi16(in0, final_rounding); 573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_adds_epi16(in1, final_rounding); 574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_adds_epi16(in2, final_rounding); 575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_adds_epi16(in3, final_rounding); 576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_adds_epi16(in4, final_rounding); 577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_adds_epi16(in5, final_rounding); 578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_adds_epi16(in6, final_rounding); 579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_adds_epi16(in7, final_rounding); 580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_srai_epi16(in0, 5); 582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_srai_epi16(in1, 5); 583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_srai_epi16(in2, 5); 584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_srai_epi16(in3, 5); 585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_srai_epi16(in4, 5); 586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_srai_epi16(in5, 5); 587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_srai_epi16(in6, 5); 588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_srai_epi16(in7, 5); 589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in0); 591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in1); 592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in2); 593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in3); 594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, 
in4); 595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in5); 596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in6); 597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in7); 598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 60091037db265ecdd914a26e056cf69207b4f50924ehkuang// perform 8x8 transpose 60191037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { 60291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); 60391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); 60491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); 60591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); 60691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); 60791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); 60891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); 60991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); 61091037db265ecdd914a26e056cf69207b4f50924ehkuang 61191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 61291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); 61391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 61491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); 61591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); 
61691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 61791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); 61891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 61991037db265ecdd914a26e056cf69207b4f50924ehkuang 62091037db265ecdd914a26e056cf69207b4f50924ehkuang res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); 62191037db265ecdd914a26e056cf69207b4f50924ehkuang res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); 62291037db265ecdd914a26e056cf69207b4f50924ehkuang res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); 62391037db265ecdd914a26e056cf69207b4f50924ehkuang res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); 62491037db265ecdd914a26e056cf69207b4f50924ehkuang res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); 62591037db265ecdd914a26e056cf69207b4f50924ehkuang res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); 62691037db265ecdd914a26e056cf69207b4f50924ehkuang res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); 62791037db265ecdd914a26e056cf69207b4f50924ehkuang res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); 62891037db265ecdd914a26e056cf69207b4f50924ehkuang} 62991037db265ecdd914a26e056cf69207b4f50924ehkuang 63091037db265ecdd914a26e056cf69207b4f50924ehkuangvoid idct8_1d_sse2(__m128i *in) { 63191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 63291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 63391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 63491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 63591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 63691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 
63791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 63891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 63991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 64091037db265ecdd914a26e056cf69207b4f50924ehkuang 64191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in0, in1, in2, in3, in4, in5, in6, in7; 64291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 64391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 64491037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 64591037db265ecdd914a26e056cf69207b4f50924ehkuang 64691037db265ecdd914a26e056cf69207b4f50924ehkuang in0 = in[0]; 64791037db265ecdd914a26e056cf69207b4f50924ehkuang in1 = in[1]; 64891037db265ecdd914a26e056cf69207b4f50924ehkuang in2 = in[2]; 64991037db265ecdd914a26e056cf69207b4f50924ehkuang in3 = in[3]; 65091037db265ecdd914a26e056cf69207b4f50924ehkuang in4 = in[4]; 65191037db265ecdd914a26e056cf69207b4f50924ehkuang in5 = in[5]; 65291037db265ecdd914a26e056cf69207b4f50924ehkuang in6 = in[6]; 65391037db265ecdd914a26e056cf69207b4f50924ehkuang in7 = in[7]; 65491037db265ecdd914a26e056cf69207b4f50924ehkuang 65591037db265ecdd914a26e056cf69207b4f50924ehkuang // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() 65691037db265ecdd914a26e056cf69207b4f50924ehkuang TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 65791037db265ecdd914a26e056cf69207b4f50924ehkuang in4, in5, in6, in7); 65891037db265ecdd914a26e056cf69207b4f50924ehkuang 65991037db265ecdd914a26e056cf69207b4f50924ehkuang // 4-stage 1D idct8x8 66091037db265ecdd914a26e056cf69207b4f50924ehkuang IDCT8x8_1D 66191037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = in0; 
66291037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = in1; 66391037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = in2; 66491037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = in3; 66591037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = in4; 66691037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = in5; 66791037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = in6; 66891037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = in7; 66991037db265ecdd914a26e056cf69207b4f50924ehkuang} 67091037db265ecdd914a26e056cf69207b4f50924ehkuang 67191037db265ecdd914a26e056cf69207b4f50924ehkuangvoid iadst8_1d_sse2(__m128i *in) { 67291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 67391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 67491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 67591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 67691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 67791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 67891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 67991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 68091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 68191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 68291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 
68391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 68491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 68591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__const_0 = _mm_set1_epi16(0); 68691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 68791037db265ecdd914a26e056cf69207b4f50924ehkuang 68891037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; 68991037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 69091037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; 69191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s0, s1, s2, s3, s4, s5, s6, s7; 69291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in0, in1, in2, in3, in4, in5, in6, in7; 69391037db265ecdd914a26e056cf69207b4f50924ehkuang 69491037db265ecdd914a26e056cf69207b4f50924ehkuang // transpose 69591037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(in, in); 69691037db265ecdd914a26e056cf69207b4f50924ehkuang 69791037db265ecdd914a26e056cf69207b4f50924ehkuang // properly aligned for butterfly input 69891037db265ecdd914a26e056cf69207b4f50924ehkuang in0 = in[7]; 69991037db265ecdd914a26e056cf69207b4f50924ehkuang in1 = in[0]; 70091037db265ecdd914a26e056cf69207b4f50924ehkuang in2 = in[5]; 70191037db265ecdd914a26e056cf69207b4f50924ehkuang in3 = in[2]; 70291037db265ecdd914a26e056cf69207b4f50924ehkuang in4 = in[3]; 70391037db265ecdd914a26e056cf69207b4f50924ehkuang in5 = in[4]; 70491037db265ecdd914a26e056cf69207b4f50924ehkuang in6 = in[1]; 70591037db265ecdd914a26e056cf69207b4f50924ehkuang in7 = in[6]; 70691037db265ecdd914a26e056cf69207b4f50924ehkuang 
70791037db265ecdd914a26e056cf69207b4f50924ehkuang // column transformation 70891037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 1 70991037db265ecdd914a26e056cf69207b4f50924ehkuang // interleave and multiply/add into 32-bit integer 71091037db265ecdd914a26e056cf69207b4f50924ehkuang s0 = _mm_unpacklo_epi16(in0, in1); 71191037db265ecdd914a26e056cf69207b4f50924ehkuang s1 = _mm_unpackhi_epi16(in0, in1); 71291037db265ecdd914a26e056cf69207b4f50924ehkuang s2 = _mm_unpacklo_epi16(in2, in3); 71391037db265ecdd914a26e056cf69207b4f50924ehkuang s3 = _mm_unpackhi_epi16(in2, in3); 71491037db265ecdd914a26e056cf69207b4f50924ehkuang s4 = _mm_unpacklo_epi16(in4, in5); 71591037db265ecdd914a26e056cf69207b4f50924ehkuang s5 = _mm_unpackhi_epi16(in4, in5); 71691037db265ecdd914a26e056cf69207b4f50924ehkuang s6 = _mm_unpacklo_epi16(in6, in7); 71791037db265ecdd914a26e056cf69207b4f50924ehkuang s7 = _mm_unpackhi_epi16(in6, in7); 71891037db265ecdd914a26e056cf69207b4f50924ehkuang 71991037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); 72091037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); 72191037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); 72291037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); 72391037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); 72491037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); 72591037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); 72691037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); 72791037db265ecdd914a26e056cf69207b4f50924ehkuang u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); 72891037db265ecdd914a26e056cf69207b4f50924ehkuang u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); 72991037db265ecdd914a26e056cf69207b4f50924ehkuang u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); 
73091037db265ecdd914a26e056cf69207b4f50924ehkuang u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); 73191037db265ecdd914a26e056cf69207b4f50924ehkuang u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); 73291037db265ecdd914a26e056cf69207b4f50924ehkuang u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); 73391037db265ecdd914a26e056cf69207b4f50924ehkuang u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); 73491037db265ecdd914a26e056cf69207b4f50924ehkuang u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); 73591037db265ecdd914a26e056cf69207b4f50924ehkuang 73691037db265ecdd914a26e056cf69207b4f50924ehkuang // addition 73791037db265ecdd914a26e056cf69207b4f50924ehkuang w0 = _mm_add_epi32(u0, u8); 73891037db265ecdd914a26e056cf69207b4f50924ehkuang w1 = _mm_add_epi32(u1, u9); 73991037db265ecdd914a26e056cf69207b4f50924ehkuang w2 = _mm_add_epi32(u2, u10); 74091037db265ecdd914a26e056cf69207b4f50924ehkuang w3 = _mm_add_epi32(u3, u11); 74191037db265ecdd914a26e056cf69207b4f50924ehkuang w4 = _mm_add_epi32(u4, u12); 74291037db265ecdd914a26e056cf69207b4f50924ehkuang w5 = _mm_add_epi32(u5, u13); 74391037db265ecdd914a26e056cf69207b4f50924ehkuang w6 = _mm_add_epi32(u6, u14); 74491037db265ecdd914a26e056cf69207b4f50924ehkuang w7 = _mm_add_epi32(u7, u15); 74591037db265ecdd914a26e056cf69207b4f50924ehkuang w8 = _mm_sub_epi32(u0, u8); 74691037db265ecdd914a26e056cf69207b4f50924ehkuang w9 = _mm_sub_epi32(u1, u9); 74791037db265ecdd914a26e056cf69207b4f50924ehkuang w10 = _mm_sub_epi32(u2, u10); 74891037db265ecdd914a26e056cf69207b4f50924ehkuang w11 = _mm_sub_epi32(u3, u11); 74991037db265ecdd914a26e056cf69207b4f50924ehkuang w12 = _mm_sub_epi32(u4, u12); 75091037db265ecdd914a26e056cf69207b4f50924ehkuang w13 = _mm_sub_epi32(u5, u13); 75191037db265ecdd914a26e056cf69207b4f50924ehkuang w14 = _mm_sub_epi32(u6, u14); 75291037db265ecdd914a26e056cf69207b4f50924ehkuang w15 = _mm_sub_epi32(u7, u15); 75391037db265ecdd914a26e056cf69207b4f50924ehkuang 75491037db265ecdd914a26e056cf69207b4f50924ehkuang // shift and rounding 
75591037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 75691037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 75791037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 75891037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 75991037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 76091037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 76191037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 76291037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 76391037db265ecdd914a26e056cf69207b4f50924ehkuang v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); 76491037db265ecdd914a26e056cf69207b4f50924ehkuang v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); 76591037db265ecdd914a26e056cf69207b4f50924ehkuang v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); 76691037db265ecdd914a26e056cf69207b4f50924ehkuang v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); 76791037db265ecdd914a26e056cf69207b4f50924ehkuang v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); 76891037db265ecdd914a26e056cf69207b4f50924ehkuang v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); 76991037db265ecdd914a26e056cf69207b4f50924ehkuang v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); 77091037db265ecdd914a26e056cf69207b4f50924ehkuang v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); 77191037db265ecdd914a26e056cf69207b4f50924ehkuang 77291037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 77391037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 77491037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 77591037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 
77691037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 77791037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 77891037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 77991037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 78091037db265ecdd914a26e056cf69207b4f50924ehkuang u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); 78191037db265ecdd914a26e056cf69207b4f50924ehkuang u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); 78291037db265ecdd914a26e056cf69207b4f50924ehkuang u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); 78391037db265ecdd914a26e056cf69207b4f50924ehkuang u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); 78491037db265ecdd914a26e056cf69207b4f50924ehkuang u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); 78591037db265ecdd914a26e056cf69207b4f50924ehkuang u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); 78691037db265ecdd914a26e056cf69207b4f50924ehkuang u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); 78791037db265ecdd914a26e056cf69207b4f50924ehkuang u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); 78891037db265ecdd914a26e056cf69207b4f50924ehkuang 78991037db265ecdd914a26e056cf69207b4f50924ehkuang // back to 16-bit and pack 8 integers into __m128i 79091037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_packs_epi32(u0, u1); 79191037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_packs_epi32(u2, u3); 79291037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_packs_epi32(u4, u5); 79391037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_packs_epi32(u6, u7); 79491037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_packs_epi32(u8, u9); 79591037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_packs_epi32(u10, u11); 79691037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_packs_epi32(u12, u13); 79791037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_packs_epi32(u14, u15); 79891037db265ecdd914a26e056cf69207b4f50924ehkuang 
79991037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 80091037db265ecdd914a26e056cf69207b4f50924ehkuang s0 = _mm_add_epi16(in[0], in[2]); 80191037db265ecdd914a26e056cf69207b4f50924ehkuang s1 = _mm_add_epi16(in[1], in[3]); 80291037db265ecdd914a26e056cf69207b4f50924ehkuang s2 = _mm_sub_epi16(in[0], in[2]); 80391037db265ecdd914a26e056cf69207b4f50924ehkuang s3 = _mm_sub_epi16(in[1], in[3]); 80491037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_unpacklo_epi16(in[4], in[5]); 80591037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_unpackhi_epi16(in[4], in[5]); 80691037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_unpacklo_epi16(in[6], in[7]); 80791037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_unpackhi_epi16(in[6], in[7]); 80891037db265ecdd914a26e056cf69207b4f50924ehkuang 80991037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); 81091037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); 81191037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); 81291037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); 81391037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); 81491037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); 81591037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); 81691037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); 81791037db265ecdd914a26e056cf69207b4f50924ehkuang 81891037db265ecdd914a26e056cf69207b4f50924ehkuang w0 = _mm_add_epi32(v0, v4); 81991037db265ecdd914a26e056cf69207b4f50924ehkuang w1 = _mm_add_epi32(v1, v5); 82091037db265ecdd914a26e056cf69207b4f50924ehkuang w2 = _mm_add_epi32(v2, v6); 82191037db265ecdd914a26e056cf69207b4f50924ehkuang w3 = _mm_add_epi32(v3, v7); 82291037db265ecdd914a26e056cf69207b4f50924ehkuang w4 = _mm_sub_epi32(v0, v4); 
82391037db265ecdd914a26e056cf69207b4f50924ehkuang w5 = _mm_sub_epi32(v1, v5); 82491037db265ecdd914a26e056cf69207b4f50924ehkuang w6 = _mm_sub_epi32(v2, v6); 82591037db265ecdd914a26e056cf69207b4f50924ehkuang w7 = _mm_sub_epi32(v3, v7); 82691037db265ecdd914a26e056cf69207b4f50924ehkuang 82791037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 82891037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 82991037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 83091037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 83191037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 83291037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 83391037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 83491037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 83591037db265ecdd914a26e056cf69207b4f50924ehkuang 83691037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 83791037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 83891037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 83991037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 84091037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 84191037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 84291037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 84391037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 84491037db265ecdd914a26e056cf69207b4f50924ehkuang 84591037db265ecdd914a26e056cf69207b4f50924ehkuang // back to 16-bit intergers 
84691037db265ecdd914a26e056cf69207b4f50924ehkuang s4 = _mm_packs_epi32(u0, u1); 84791037db265ecdd914a26e056cf69207b4f50924ehkuang s5 = _mm_packs_epi32(u2, u3); 84891037db265ecdd914a26e056cf69207b4f50924ehkuang s6 = _mm_packs_epi32(u4, u5); 84991037db265ecdd914a26e056cf69207b4f50924ehkuang s7 = _mm_packs_epi32(u6, u7); 85091037db265ecdd914a26e056cf69207b4f50924ehkuang 85191037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 3 85291037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_unpacklo_epi16(s2, s3); 85391037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_unpackhi_epi16(s2, s3); 85491037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_unpacklo_epi16(s6, s7); 85591037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_unpackhi_epi16(s6, s7); 85691037db265ecdd914a26e056cf69207b4f50924ehkuang 85791037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); 85891037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); 85991037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); 86091037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); 86191037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); 86291037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); 86391037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); 86491037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); 86591037db265ecdd914a26e056cf69207b4f50924ehkuang 86691037db265ecdd914a26e056cf69207b4f50924ehkuang u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 86791037db265ecdd914a26e056cf69207b4f50924ehkuang u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 86891037db265ecdd914a26e056cf69207b4f50924ehkuang u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 86991037db265ecdd914a26e056cf69207b4f50924ehkuang u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 
87091037db265ecdd914a26e056cf69207b4f50924ehkuang u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 87191037db265ecdd914a26e056cf69207b4f50924ehkuang u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 87291037db265ecdd914a26e056cf69207b4f50924ehkuang u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 87391037db265ecdd914a26e056cf69207b4f50924ehkuang u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 87491037db265ecdd914a26e056cf69207b4f50924ehkuang 87591037db265ecdd914a26e056cf69207b4f50924ehkuang v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 87691037db265ecdd914a26e056cf69207b4f50924ehkuang v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 87791037db265ecdd914a26e056cf69207b4f50924ehkuang v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 87891037db265ecdd914a26e056cf69207b4f50924ehkuang v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 87991037db265ecdd914a26e056cf69207b4f50924ehkuang v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 88091037db265ecdd914a26e056cf69207b4f50924ehkuang v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 88191037db265ecdd914a26e056cf69207b4f50924ehkuang v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 88291037db265ecdd914a26e056cf69207b4f50924ehkuang v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 88391037db265ecdd914a26e056cf69207b4f50924ehkuang 88491037db265ecdd914a26e056cf69207b4f50924ehkuang s2 = _mm_packs_epi32(v0, v1); 88591037db265ecdd914a26e056cf69207b4f50924ehkuang s3 = _mm_packs_epi32(v2, v3); 88691037db265ecdd914a26e056cf69207b4f50924ehkuang s6 = _mm_packs_epi32(v4, v5); 88791037db265ecdd914a26e056cf69207b4f50924ehkuang s7 = _mm_packs_epi32(v6, v7); 88891037db265ecdd914a26e056cf69207b4f50924ehkuang 88991037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = s0; 89091037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_sub_epi16(k__const_0, s4); 89191037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = s6; 89291037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_sub_epi16(k__const_0, s2); 89391037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = s3; 89491037db265ecdd914a26e056cf69207b4f50924ehkuang 
in[5] = _mm_sub_epi16(k__const_0, s7); 89591037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = s5; 89691037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_sub_epi16(k__const_0, s1); 89791037db265ecdd914a26e056cf69207b4f50924ehkuang} 89891037db265ecdd914a26e056cf69207b4f50924ehkuang 89991037db265ecdd914a26e056cf69207b4f50924ehkuang 90091037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride, 90191037db265ecdd914a26e056cf69207b4f50924ehkuang int tx_type) { 90291037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in[8]; 90391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i zero = _mm_setzero_si128(); 90491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i final_rounding = _mm_set1_epi16(1<<4); 90591037db265ecdd914a26e056cf69207b4f50924ehkuang 90691037db265ecdd914a26e056cf69207b4f50924ehkuang // load input data 90791037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_load_si128((__m128i *)input); 90891037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_load_si128((__m128i *)(input + 8 * 1)); 90991037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_load_si128((__m128i *)(input + 8 * 2)); 91091037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_load_si128((__m128i *)(input + 8 * 3)); 91191037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_load_si128((__m128i *)(input + 8 * 4)); 91291037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_load_si128((__m128i *)(input + 8 * 5)); 91391037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_load_si128((__m128i *)(input + 8 * 6)); 91491037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_load_si128((__m128i *)(input + 8 * 7)); 91591037db265ecdd914a26e056cf69207b4f50924ehkuang 91691037db265ecdd914a26e056cf69207b4f50924ehkuang switch (tx_type) { 91791037db265ecdd914a26e056cf69207b4f50924ehkuang case 0: // DCT_DCT 91891037db265ecdd914a26e056cf69207b4f50924ehkuang idct8_1d_sse2(in); 
91991037db265ecdd914a26e056cf69207b4f50924ehkuang idct8_1d_sse2(in); 92091037db265ecdd914a26e056cf69207b4f50924ehkuang break; 92191037db265ecdd914a26e056cf69207b4f50924ehkuang case 1: // ADST_DCT 92291037db265ecdd914a26e056cf69207b4f50924ehkuang idct8_1d_sse2(in); 92391037db265ecdd914a26e056cf69207b4f50924ehkuang iadst8_1d_sse2(in); 92491037db265ecdd914a26e056cf69207b4f50924ehkuang break; 92591037db265ecdd914a26e056cf69207b4f50924ehkuang case 2: // DCT_ADST 92691037db265ecdd914a26e056cf69207b4f50924ehkuang iadst8_1d_sse2(in); 92791037db265ecdd914a26e056cf69207b4f50924ehkuang idct8_1d_sse2(in); 92891037db265ecdd914a26e056cf69207b4f50924ehkuang break; 92991037db265ecdd914a26e056cf69207b4f50924ehkuang case 3: // ADST_ADST 93091037db265ecdd914a26e056cf69207b4f50924ehkuang iadst8_1d_sse2(in); 93191037db265ecdd914a26e056cf69207b4f50924ehkuang iadst8_1d_sse2(in); 93291037db265ecdd914a26e056cf69207b4f50924ehkuang break; 93391037db265ecdd914a26e056cf69207b4f50924ehkuang default: 93491037db265ecdd914a26e056cf69207b4f50924ehkuang assert(0); 93591037db265ecdd914a26e056cf69207b4f50924ehkuang break; 93691037db265ecdd914a26e056cf69207b4f50924ehkuang } 93791037db265ecdd914a26e056cf69207b4f50924ehkuang 93891037db265ecdd914a26e056cf69207b4f50924ehkuang // Final rounding and shift 93991037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_adds_epi16(in[0], final_rounding); 94091037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_adds_epi16(in[1], final_rounding); 94191037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_adds_epi16(in[2], final_rounding); 94291037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_adds_epi16(in[3], final_rounding); 94391037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_adds_epi16(in[4], final_rounding); 94491037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_adds_epi16(in[5], final_rounding); 94591037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_adds_epi16(in[6], final_rounding); 
94691037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_adds_epi16(in[7], final_rounding); 94791037db265ecdd914a26e056cf69207b4f50924ehkuang 94891037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_srai_epi16(in[0], 5); 94991037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_srai_epi16(in[1], 5); 95091037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_srai_epi16(in[2], 5); 95191037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_srai_epi16(in[3], 5); 95291037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_srai_epi16(in[4], 5); 95391037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_srai_epi16(in[5], 5); 95491037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_srai_epi16(in[6], 5); 95591037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_srai_epi16(in[7], 5); 95691037db265ecdd914a26e056cf69207b4f50924ehkuang 95791037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[0]); 95891037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[1]); 95991037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[2]); 96091037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[3]); 96191037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[4]); 96291037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[5]); 96391037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[6]); 96491037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[7]); 96591037db265ecdd914a26e056cf69207b4f50924ehkuang} 96691037db265ecdd914a26e056cf69207b4f50924ehkuang 967ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { 968ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 969ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 970ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const 
__m128i final_rounding = _mm_set1_epi16(1<<4); 971ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 972ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 973ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 974ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 975ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 976ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 977ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 978ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 979ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 980ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 981ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i in0, in1, in2, in3, in4, in5, in6, in7; 982ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 983ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 984ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 985ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 986ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Rows. Load 4-row input data. 
987ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_load_si128((__m128i *)input); 988ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); 989ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); 990ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); 991ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 992ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 8x4 Transpose 993ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) 994ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 995ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage1 996ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 997ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); 998ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); 999ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1000ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_17, stg1_0); 1001ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_17, stg1_1); 1002ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_madd_epi16(lo_35, stg1_2); 1003ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_madd_epi16(lo_35, stg1_3); 1004ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1005ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 1006ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 1007ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_add_epi32(tmp4, rounding); 1008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_add_epi32(tmp6, rounding); 1009ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1010ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 
1011ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 1012ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 1013ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1014ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_4 = _mm_packs_epi32(tmp0, zero); 1015ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_7 = _mm_packs_epi32(tmp2, zero); 1016ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_5 = _mm_packs_epi32(tmp4, zero); 1017ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_6 = _mm_packs_epi32(tmp6, zero); 1018ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1019ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1020ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage2 1021ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1022ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); 1023ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); 1024ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1025ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_04, stg2_0); 1026ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_04, stg2_1); 1027ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_madd_epi16(lo_26, stg2_2); 1028ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_madd_epi16(lo_26, stg2_3); 1029ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1030ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 1031ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 1032ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_add_epi32(tmp4, rounding); 1033ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_add_epi32(tmp6, rounding); 1034ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1035ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = 
_mm_srai_epi32(tmp2, DCT_CONST_BITS); 1036ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 1037ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 1038ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1039ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_0 = _mm_packs_epi32(tmp0, zero); 1040ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_1 = _mm_packs_epi32(tmp2, zero); 1041ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_2 = _mm_packs_epi32(tmp4, zero); 1042ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_3 = _mm_packs_epi32(tmp6, zero); 1043ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1044ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); 1045ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); 1046ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); 1047ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); 1048ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1049ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1050ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage3 1051ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 1052ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); 1053ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); 1054ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); 1055ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); 1056ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); 1057ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1058ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_56, stg3_0); 1059ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 
1060ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1061ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 1062ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 1063ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1064ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 1065ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1066ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_5 = _mm_packs_epi32(tmp0, zero); 1067ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_6 = _mm_packs_epi32(tmp2, zero); 1068ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1069ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1070ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage4 1071ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_adds_epi16(stp1_0, stp2_7); 1072ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_adds_epi16(stp1_1, stp1_6); 1073ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_adds_epi16(stp1_2, stp1_5); 1074ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_adds_epi16(stp1_3, stp2_4); 1075ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_subs_epi16(stp1_3, stp2_4); 1076ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_subs_epi16(stp1_2, stp1_5); 1077ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_subs_epi16(stp1_1, stp1_6); 1078ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_subs_epi16(stp1_0, stp2_7); 1079ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1080ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Columns. 
4x8 Transpose 1081ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 1082ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4, in5, in6, in7) 1083ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1084ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 1D idct8x8 1085ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang IDCT8x8_1D 1086ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1087ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final rounding and shift 1088ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_adds_epi16(in0, final_rounding); 1089ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_adds_epi16(in1, final_rounding); 1090ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_adds_epi16(in2, final_rounding); 1091ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_adds_epi16(in3, final_rounding); 1092ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_adds_epi16(in4, final_rounding); 1093ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_adds_epi16(in5, final_rounding); 1094ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_adds_epi16(in6, final_rounding); 1095ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_adds_epi16(in7, final_rounding); 1096ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1097ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_srai_epi16(in0, 5); 1098ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_srai_epi16(in1, 5); 1099ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_srai_epi16(in2, 5); 1100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_srai_epi16(in3, 5); 1101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_srai_epi16(in4, 5); 1102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_srai_epi16(in5, 5); 1103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_srai_epi16(in6, 5); 1104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_srai_epi16(in7, 5); 
1105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in0); 1107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in1); 1108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in2); 1109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in3); 1110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in4); 1111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in5); 1112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in6); 1113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in7); 1114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 1115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define IDCT16x16_1D \ 1117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage2 */ \ 1118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 1119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ 1120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ 1121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ 1122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ 1123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ 1124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ 1125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ 1126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ 1127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, 
hi_9_7, \ 1129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg2_0, stg2_1, stg2_2, stg2_3, \ 1130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_8, stp2_15, stp2_9, stp2_14) \ 1131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ 1133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg2_4, stg2_5, stg2_6, stg2_7, \ 1134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_10, stp2_13, stp2_11, stp2_12) \ 1135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 1136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage3 */ \ 1138ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 1139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ 1140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ 1141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ 1142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ 1143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ 1145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg3_0, stg3_1, stg3_2, stg3_3, \ 1146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_4, stp1_7, stp1_5, stp1_6) \ 1147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ 1149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ 1150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ 1151ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ 
1152ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1153ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ 1154ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ 1155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ 1156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ 1157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 1158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1159ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage4 */ \ 1160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 1161ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ 1162ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ 1163ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ 1164ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ 1165ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1166ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 1167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 1168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ 1172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg4_0, stg4_1, stg4_2, stg4_3, \ 1173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_0, stp2_1, stp2_2, stp2_3) \ 1174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 
1175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ 1176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ 1177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ 1178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ 1179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ 1181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg4_4, stg4_5, stg4_6, stg4_7, \ 1182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_9, stp2_14, stp2_10, stp2_13) \ 1183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 1184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage5 */ \ 1186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 1187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 1188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 1189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ 1191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ 1192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ 1193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ 1194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 1196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 1197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 1198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_madd_epi16(hi_6_5, 
stg4_0); \ 1199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); \ 1201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_add_epi32(tmp1, rounding); \ 1202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); \ 1203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_add_epi32(tmp3, rounding); \ 1204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 1206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 1207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 1208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 1209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 1211ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 1212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ 1214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 1215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 1216ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ 1217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ 1219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 1220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 1221ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ 
1222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } \ 1223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Stage6 */ \ 1225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { \ 1226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1227ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1228ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 1229ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 1230ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1231ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ 1232ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 1233ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 1234ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ 1235ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ 1236ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 1237ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 1238ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ 1239ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang \ 1240ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 1241ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg6_0, stg4_0, stg6_0, stg4_0, \ 1242ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_10, stp2_13, stp2_11, stp2_12) \ 1243ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid 
vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { 1246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 1247ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i final_rounding = _mm_set1_epi16(1<<5); 1248ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 1249ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1250ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1251ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 1252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 1254ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1255ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 1256ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 1257ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 1258ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1259ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1260ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 1261ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1262ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 1263ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1264ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_0 
= pair_set_epi16(cospi_16_64, cospi_16_64); 1265ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1267ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 1268ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1269ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 1270ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1271ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1272ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1273ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1274ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1275ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, 1276ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, 1277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = zero, in11 = zero, in12 = zero, in13 = zero, 1278ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = zero, in15 = zero; 1279ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, 1280ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, 1281ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l12 = zero, l13 = zero, l14 = zero, l15 = zero; 1282ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero, 
1283ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero, 1284ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r12 = zero, r13 = zero, r14 = zero, r15 = zero; 1285ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 1286ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 1287ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8_0, stp1_12_0; 1288ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 1289ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; 1290ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1291ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int i; 1292ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1293ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct. 1294ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang for (i = 0; i < 4; i++) { 1295ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 1-D idct 1296ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (i < 2) { 1297ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (i == 1) input += 128; 1298ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1299ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Load input data. 
1300ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_load_si128((__m128i *)input); 1301ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); 1302ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); 1303ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); 1304ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); 1305ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); 1306ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); 1307ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); 1308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_load_si128((__m128i *)(input + 8 * 8)); 1309ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_load_si128((__m128i *)(input + 8 * 9)); 1310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_load_si128((__m128i *)(input + 8 * 10)); 1311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_load_si128((__m128i *)(input + 8 * 11)); 1312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_load_si128((__m128i *)(input + 8 * 12)); 1313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_load_si128((__m128i *)(input + 8 * 13)); 1314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_load_si128((__m128i *)(input + 8 * 14)); 1315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_load_si128((__m128i *)(input + 8 * 15)); 1316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 1318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4, in5, in6, in7); 1319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, 
1320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10, in11, in12, in13, in14, in15); 1321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (i == 2) { 1324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, 1325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5, in6, in7); 1326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, 1327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13, in14, in15); 1328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (i == 3) { 1331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, 1332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4, in5, in6, in7); 1333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, 1334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12, in13, in14, in15); 1335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang IDCT16x16_1D 1338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage7 1340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (i == 0) { 1341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Left 8x16 1342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l0 = _mm_add_epi16(stp2_0, stp1_15); 1343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l1 = _mm_add_epi16(stp2_1, stp1_14); 1344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l2 = _mm_add_epi16(stp2_2, stp2_13); 1345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l3 = _mm_add_epi16(stp2_3, stp2_12); 
1346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l4 = _mm_add_epi16(stp2_4, stp2_11); 1347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l5 = _mm_add_epi16(stp2_5, stp2_10); 1348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l6 = _mm_add_epi16(stp2_6, stp1_9); 1349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l7 = _mm_add_epi16(stp2_7, stp1_8); 1350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l8 = _mm_sub_epi16(stp2_7, stp1_8); 1351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l9 = _mm_sub_epi16(stp2_6, stp1_9); 1352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l10 = _mm_sub_epi16(stp2_5, stp2_10); 1353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l11 = _mm_sub_epi16(stp2_4, stp2_11); 1354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l12 = _mm_sub_epi16(stp2_3, stp2_12); 1355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l13 = _mm_sub_epi16(stp2_2, stp2_13); 1356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l14 = _mm_sub_epi16(stp2_1, stp1_14); 1357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l15 = _mm_sub_epi16(stp2_0, stp1_15); 1358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } else if (i == 1) { 1359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Right 8x16 1360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r0 = _mm_add_epi16(stp2_0, stp1_15); 1361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r1 = _mm_add_epi16(stp2_1, stp1_14); 1362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r2 = _mm_add_epi16(stp2_2, stp2_13); 1363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r3 = _mm_add_epi16(stp2_3, stp2_12); 1364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r4 = _mm_add_epi16(stp2_4, stp2_11); 1365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r5 = _mm_add_epi16(stp2_5, stp2_10); 1366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r6 = _mm_add_epi16(stp2_6, stp1_9); 1367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r7 = _mm_add_epi16(stp2_7, stp1_8); 1368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r8 = _mm_sub_epi16(stp2_7, stp1_8); 
1369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r9 = _mm_sub_epi16(stp2_6, stp1_9); 1370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r10 = _mm_sub_epi16(stp2_5, stp2_10); 1371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r11 = _mm_sub_epi16(stp2_4, stp2_11); 1372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r12 = _mm_sub_epi16(stp2_3, stp2_12); 1373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r13 = _mm_sub_epi16(stp2_2, stp2_13); 1374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r14 = _mm_sub_epi16(stp2_1, stp1_14); 1375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang r15 = _mm_sub_epi16(stp2_0, stp1_15); 1376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } else { 1377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 2-D 1378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_add_epi16(stp2_0, stp1_15); 1379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_add_epi16(stp2_1, stp1_14); 1380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_add_epi16(stp2_2, stp2_13); 1381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_add_epi16(stp2_3, stp2_12); 1382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_add_epi16(stp2_4, stp2_11); 1383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_add_epi16(stp2_5, stp2_10); 1384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_add_epi16(stp2_6, stp1_9); 1385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_add_epi16(stp2_7, stp1_8); 1386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_sub_epi16(stp2_7, stp1_8); 1387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_sub_epi16(stp2_6, stp1_9); 1388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_sub_epi16(stp2_5, stp2_10); 1389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_sub_epi16(stp2_4, stp2_11); 1390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_sub_epi16(stp2_3, stp2_12); 1391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_sub_epi16(stp2_2, stp2_13); 
1392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_sub_epi16(stp2_1, stp1_14); 1393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_sub_epi16(stp2_0, stp1_15); 1394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final rounding and shift 1396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_adds_epi16(in0, final_rounding); 1397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_adds_epi16(in1, final_rounding); 1398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_adds_epi16(in2, final_rounding); 1399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_adds_epi16(in3, final_rounding); 1400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_adds_epi16(in4, final_rounding); 1401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_adds_epi16(in5, final_rounding); 1402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_adds_epi16(in6, final_rounding); 1403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_adds_epi16(in7, final_rounding); 1404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_adds_epi16(in8, final_rounding); 1405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_adds_epi16(in9, final_rounding); 1406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_adds_epi16(in10, final_rounding); 1407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_adds_epi16(in11, final_rounding); 1408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_adds_epi16(in12, final_rounding); 1409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_adds_epi16(in13, final_rounding); 1410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_adds_epi16(in14, final_rounding); 1411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_adds_epi16(in15, final_rounding); 1412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_srai_epi16(in0, 6); 
1414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_srai_epi16(in1, 6); 1415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_srai_epi16(in2, 6); 1416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_srai_epi16(in3, 6); 1417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_srai_epi16(in4, 6); 1418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_srai_epi16(in5, 6); 1419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_srai_epi16(in6, 6); 1420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_srai_epi16(in7, 6); 1421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_srai_epi16(in8, 6); 1422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_srai_epi16(in9, 6); 1423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_srai_epi16(in10, 6); 1424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_srai_epi16(in11, 6); 1425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_srai_epi16(in12, 6); 1426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_srai_epi16(in13, 6); 1427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_srai_epi16(in14, 6); 1428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_srai_epi16(in15, 6); 1429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in0); 1431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in1); 1432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in2); 1433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in3); 1434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in4); 1435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in5); 1436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in6); 1437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in7); 1438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in8); 
1439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in9); 1440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in10); 1441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in11); 1442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in12); 1443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in13); 1444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in14); 1445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in15); 1446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang dest += 8 - (stride * 16); 1448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 1450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 1451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 145291037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { 145391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i tbuf[8]; 145491037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(res0, res0); 145591037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(res1, tbuf); 145691037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(res0 + 8, res1); 145791037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_8x8(res1 + 8, res1 + 8); 145891037db265ecdd914a26e056cf69207b4f50924ehkuang 145991037db265ecdd914a26e056cf69207b4f50924ehkuang res0[8] = tbuf[0]; 146091037db265ecdd914a26e056cf69207b4f50924ehkuang res0[9] = tbuf[1]; 146191037db265ecdd914a26e056cf69207b4f50924ehkuang res0[10] = tbuf[2]; 146291037db265ecdd914a26e056cf69207b4f50924ehkuang res0[11] = tbuf[3]; 146391037db265ecdd914a26e056cf69207b4f50924ehkuang res0[12] = tbuf[4]; 146491037db265ecdd914a26e056cf69207b4f50924ehkuang res0[13] = tbuf[5]; 146591037db265ecdd914a26e056cf69207b4f50924ehkuang res0[14] = tbuf[6]; 
146691037db265ecdd914a26e056cf69207b4f50924ehkuang res0[15] = tbuf[7]; 146791037db265ecdd914a26e056cf69207b4f50924ehkuang} 146891037db265ecdd914a26e056cf69207b4f50924ehkuang 146991037db265ecdd914a26e056cf69207b4f50924ehkuangvoid iadst16_1d_8col(__m128i *in) { 147091037db265ecdd914a26e056cf69207b4f50924ehkuang // perform 16x16 1-D ADST for 8 columns 147191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i s[16], x[16], u[32], v[32]; 147291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 147391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 147491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 147591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 147691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); 147791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); 147891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); 147991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); 148091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); 148191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); 148291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); 148391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); 
148491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); 148591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); 148691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); 148791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); 148891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 148991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 149091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 149191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 149291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); 149391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); 149491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 149591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 149691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 149791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); 149891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 149991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 
150091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 150191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 150291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i kZero = _mm_set1_epi16(0); 150391037db265ecdd914a26e056cf69207b4f50924ehkuang 150491037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(in[15], in[0]); 150591037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(in[15], in[0]); 150691037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(in[13], in[2]); 150791037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(in[13], in[2]); 150891037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(in[11], in[4]); 150991037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(in[11], in[4]); 151091037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(in[9], in[6]); 151191037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(in[9], in[6]); 151291037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_unpacklo_epi16(in[7], in[8]); 151391037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_unpackhi_epi16(in[7], in[8]); 151491037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_unpacklo_epi16(in[5], in[10]); 151591037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_unpackhi_epi16(in[5], in[10]); 151691037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_unpacklo_epi16(in[3], in[12]); 151791037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_unpackhi_epi16(in[3], in[12]); 151891037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_unpacklo_epi16(in[1], in[14]); 151991037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_unpackhi_epi16(in[1], in[14]); 152091037db265ecdd914a26e056cf69207b4f50924ehkuang 152191037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = 
_mm_madd_epi16(u[0], k__cospi_p01_p31); 152291037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); 152391037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); 152491037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); 152591037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); 152691037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); 152791037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); 152891037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); 152991037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); 153091037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); 153191037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); 153291037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); 153391037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); 153491037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); 153591037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); 153691037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); 153791037db265ecdd914a26e056cf69207b4f50924ehkuang v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); 153891037db265ecdd914a26e056cf69207b4f50924ehkuang v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); 153991037db265ecdd914a26e056cf69207b4f50924ehkuang v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); 154091037db265ecdd914a26e056cf69207b4f50924ehkuang v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); 154191037db265ecdd914a26e056cf69207b4f50924ehkuang v[20] = _mm_madd_epi16(u[10], 
k__cospi_p21_p11); 154291037db265ecdd914a26e056cf69207b4f50924ehkuang v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); 154391037db265ecdd914a26e056cf69207b4f50924ehkuang v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); 154491037db265ecdd914a26e056cf69207b4f50924ehkuang v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); 154591037db265ecdd914a26e056cf69207b4f50924ehkuang v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); 154691037db265ecdd914a26e056cf69207b4f50924ehkuang v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); 154791037db265ecdd914a26e056cf69207b4f50924ehkuang v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); 154891037db265ecdd914a26e056cf69207b4f50924ehkuang v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); 154991037db265ecdd914a26e056cf69207b4f50924ehkuang v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); 155091037db265ecdd914a26e056cf69207b4f50924ehkuang v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); 155191037db265ecdd914a26e056cf69207b4f50924ehkuang v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); 155291037db265ecdd914a26e056cf69207b4f50924ehkuang v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); 155391037db265ecdd914a26e056cf69207b4f50924ehkuang 155491037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[16]); 155591037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], v[17]); 155691037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], v[18]); 155791037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], v[19]); 155891037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], v[20]); 155991037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], v[21]); 156091037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], v[22]); 156191037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], v[23]); 156291037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], v[24]); 156391037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] 
= _mm_add_epi32(v[9], v[25]); 156491037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], v[26]); 156591037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], v[27]); 156691037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(v[12], v[28]); 156791037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(v[13], v[29]); 156891037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(v[14], v[30]); 156991037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(v[15], v[31]); 157091037db265ecdd914a26e056cf69207b4f50924ehkuang u[16] = _mm_sub_epi32(v[0], v[16]); 157191037db265ecdd914a26e056cf69207b4f50924ehkuang u[17] = _mm_sub_epi32(v[1], v[17]); 157291037db265ecdd914a26e056cf69207b4f50924ehkuang u[18] = _mm_sub_epi32(v[2], v[18]); 157391037db265ecdd914a26e056cf69207b4f50924ehkuang u[19] = _mm_sub_epi32(v[3], v[19]); 157491037db265ecdd914a26e056cf69207b4f50924ehkuang u[20] = _mm_sub_epi32(v[4], v[20]); 157591037db265ecdd914a26e056cf69207b4f50924ehkuang u[21] = _mm_sub_epi32(v[5], v[21]); 157691037db265ecdd914a26e056cf69207b4f50924ehkuang u[22] = _mm_sub_epi32(v[6], v[22]); 157791037db265ecdd914a26e056cf69207b4f50924ehkuang u[23] = _mm_sub_epi32(v[7], v[23]); 157891037db265ecdd914a26e056cf69207b4f50924ehkuang u[24] = _mm_sub_epi32(v[8], v[24]); 157991037db265ecdd914a26e056cf69207b4f50924ehkuang u[25] = _mm_sub_epi32(v[9], v[25]); 158091037db265ecdd914a26e056cf69207b4f50924ehkuang u[26] = _mm_sub_epi32(v[10], v[26]); 158191037db265ecdd914a26e056cf69207b4f50924ehkuang u[27] = _mm_sub_epi32(v[11], v[27]); 158291037db265ecdd914a26e056cf69207b4f50924ehkuang u[28] = _mm_sub_epi32(v[12], v[28]); 158391037db265ecdd914a26e056cf69207b4f50924ehkuang u[29] = _mm_sub_epi32(v[13], v[29]); 158491037db265ecdd914a26e056cf69207b4f50924ehkuang u[30] = _mm_sub_epi32(v[14], v[30]); 158591037db265ecdd914a26e056cf69207b4f50924ehkuang u[31] = _mm_sub_epi32(v[15], v[31]); 
158691037db265ecdd914a26e056cf69207b4f50924ehkuang 158791037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 158891037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 158991037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 159091037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 159191037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 159291037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 159391037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 159491037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 159591037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 159691037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 159791037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 159891037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 159991037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 160091037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 160191037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 160291037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 160391037db265ecdd914a26e056cf69207b4f50924ehkuang v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 160491037db265ecdd914a26e056cf69207b4f50924ehkuang v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 160591037db265ecdd914a26e056cf69207b4f50924ehkuang v[18] = _mm_add_epi32(u[18], 
k__DCT_CONST_ROUNDING); 160691037db265ecdd914a26e056cf69207b4f50924ehkuang v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 160791037db265ecdd914a26e056cf69207b4f50924ehkuang v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 160891037db265ecdd914a26e056cf69207b4f50924ehkuang v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 160991037db265ecdd914a26e056cf69207b4f50924ehkuang v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 161091037db265ecdd914a26e056cf69207b4f50924ehkuang v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 161191037db265ecdd914a26e056cf69207b4f50924ehkuang v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 161291037db265ecdd914a26e056cf69207b4f50924ehkuang v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 161391037db265ecdd914a26e056cf69207b4f50924ehkuang v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 161491037db265ecdd914a26e056cf69207b4f50924ehkuang v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 161591037db265ecdd914a26e056cf69207b4f50924ehkuang v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 161691037db265ecdd914a26e056cf69207b4f50924ehkuang v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 161791037db265ecdd914a26e056cf69207b4f50924ehkuang v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 161891037db265ecdd914a26e056cf69207b4f50924ehkuang v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); 161991037db265ecdd914a26e056cf69207b4f50924ehkuang 162091037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 162191037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 162291037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 162391037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 162491037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 162591037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(v[5], 
DCT_CONST_BITS); 162691037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 162791037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 162891037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 162991037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 163091037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 163191037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 163291037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 163391037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 163491037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 163591037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 163691037db265ecdd914a26e056cf69207b4f50924ehkuang u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); 163791037db265ecdd914a26e056cf69207b4f50924ehkuang u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); 163891037db265ecdd914a26e056cf69207b4f50924ehkuang u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); 163991037db265ecdd914a26e056cf69207b4f50924ehkuang u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); 164091037db265ecdd914a26e056cf69207b4f50924ehkuang u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); 164191037db265ecdd914a26e056cf69207b4f50924ehkuang u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); 164291037db265ecdd914a26e056cf69207b4f50924ehkuang u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); 164391037db265ecdd914a26e056cf69207b4f50924ehkuang u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); 164491037db265ecdd914a26e056cf69207b4f50924ehkuang u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); 164591037db265ecdd914a26e056cf69207b4f50924ehkuang u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); 
164691037db265ecdd914a26e056cf69207b4f50924ehkuang u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); 164791037db265ecdd914a26e056cf69207b4f50924ehkuang u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); 164891037db265ecdd914a26e056cf69207b4f50924ehkuang u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); 164991037db265ecdd914a26e056cf69207b4f50924ehkuang u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); 165091037db265ecdd914a26e056cf69207b4f50924ehkuang u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); 165191037db265ecdd914a26e056cf69207b4f50924ehkuang u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); 165291037db265ecdd914a26e056cf69207b4f50924ehkuang 165391037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = _mm_packs_epi32(u[0], u[1]); 165491037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_packs_epi32(u[2], u[3]); 165591037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_packs_epi32(u[4], u[5]); 165691037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_packs_epi32(u[6], u[7]); 165791037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_packs_epi32(u[8], u[9]); 165891037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_packs_epi32(u[10], u[11]); 165991037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_packs_epi32(u[12], u[13]); 166091037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_packs_epi32(u[14], u[15]); 166191037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = _mm_packs_epi32(u[16], u[17]); 166291037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = _mm_packs_epi32(u[18], u[19]); 166391037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_packs_epi32(u[20], u[21]); 166491037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = _mm_packs_epi32(u[22], u[23]); 166591037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = _mm_packs_epi32(u[24], u[25]); 166691037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(u[26], u[27]); 166791037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = _mm_packs_epi32(u[28], u[29]); 
166891037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = _mm_packs_epi32(u[30], u[31]); 166991037db265ecdd914a26e056cf69207b4f50924ehkuang 167091037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 167191037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[8], s[9]); 167291037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[8], s[9]); 167391037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[10], s[11]); 167491037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[10], s[11]); 167591037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(s[12], s[13]); 167691037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(s[12], s[13]); 167791037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(s[14], s[15]); 167891037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(s[14], s[15]); 167991037db265ecdd914a26e056cf69207b4f50924ehkuang 168091037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 168191037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 168291037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 168391037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 168491037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 168591037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 168691037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 168791037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 168891037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); 168991037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); 
169091037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); 169191037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); 169291037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); 169391037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); 169491037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); 169591037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); 169691037db265ecdd914a26e056cf69207b4f50924ehkuang 169791037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[8]); 169891037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], v[9]); 169991037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], v[10]); 170091037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], v[11]); 170191037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], v[12]); 170291037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], v[13]); 170391037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], v[14]); 170491037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], v[15]); 170591037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_sub_epi32(v[0], v[8]); 170691037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_sub_epi32(v[1], v[9]); 170791037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_sub_epi32(v[2], v[10]); 170891037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_sub_epi32(v[3], v[11]); 170991037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_sub_epi32(v[4], v[12]); 171091037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_sub_epi32(v[5], v[13]); 171191037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_sub_epi32(v[6], v[14]); 171291037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = 
_mm_sub_epi32(v[7], v[15]); 171391037db265ecdd914a26e056cf69207b4f50924ehkuang 171491037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 171591037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 171691037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 171791037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 171891037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 171991037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 172091037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 172191037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 172291037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 172391037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 172491037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 172591037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 172691037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 172791037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 172891037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 172991037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 173091037db265ecdd914a26e056cf69207b4f50924ehkuang 173191037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 173291037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 
173391037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 173491037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 173591037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 173691037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 173791037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 173891037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 173991037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 174091037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 174191037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 174291037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 174391037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 174491037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 174591037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 174691037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 174791037db265ecdd914a26e056cf69207b4f50924ehkuang 174891037db265ecdd914a26e056cf69207b4f50924ehkuang x[0] = _mm_add_epi16(s[0], s[4]); 174991037db265ecdd914a26e056cf69207b4f50924ehkuang x[1] = _mm_add_epi16(s[1], s[5]); 175091037db265ecdd914a26e056cf69207b4f50924ehkuang x[2] = _mm_add_epi16(s[2], s[6]); 175191037db265ecdd914a26e056cf69207b4f50924ehkuang x[3] = _mm_add_epi16(s[3], s[7]); 175291037db265ecdd914a26e056cf69207b4f50924ehkuang x[4] = _mm_sub_epi16(s[0], s[4]); 175391037db265ecdd914a26e056cf69207b4f50924ehkuang x[5] = _mm_sub_epi16(s[1], s[5]); 175491037db265ecdd914a26e056cf69207b4f50924ehkuang x[6] = _mm_sub_epi16(s[2], 
s[6]); 175591037db265ecdd914a26e056cf69207b4f50924ehkuang x[7] = _mm_sub_epi16(s[3], s[7]); 175691037db265ecdd914a26e056cf69207b4f50924ehkuang x[8] = _mm_packs_epi32(u[0], u[1]); 175791037db265ecdd914a26e056cf69207b4f50924ehkuang x[9] = _mm_packs_epi32(u[2], u[3]); 175891037db265ecdd914a26e056cf69207b4f50924ehkuang x[10] = _mm_packs_epi32(u[4], u[5]); 175991037db265ecdd914a26e056cf69207b4f50924ehkuang x[11] = _mm_packs_epi32(u[6], u[7]); 176091037db265ecdd914a26e056cf69207b4f50924ehkuang x[12] = _mm_packs_epi32(u[8], u[9]); 176191037db265ecdd914a26e056cf69207b4f50924ehkuang x[13] = _mm_packs_epi32(u[10], u[11]); 176291037db265ecdd914a26e056cf69207b4f50924ehkuang x[14] = _mm_packs_epi32(u[12], u[13]); 176391037db265ecdd914a26e056cf69207b4f50924ehkuang x[15] = _mm_packs_epi32(u[14], u[15]); 176491037db265ecdd914a26e056cf69207b4f50924ehkuang 176591037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 3 176691037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(x[4], x[5]); 176791037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(x[4], x[5]); 176891037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(x[6], x[7]); 176991037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(x[6], x[7]); 177091037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(x[12], x[13]); 177191037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(x[12], x[13]); 177291037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(x[14], x[15]); 177391037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(x[14], x[15]); 177491037db265ecdd914a26e056cf69207b4f50924ehkuang 177591037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); 177691037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 177791037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); 
177891037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 177991037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); 178091037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); 178191037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 178291037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 178391037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); 178491037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); 178591037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); 178691037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); 178791037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); 178891037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); 178991037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); 179091037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); 179191037db265ecdd914a26e056cf69207b4f50924ehkuang 179291037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], v[4]); 179391037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], v[5]); 179491037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], v[6]); 179591037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], v[7]); 179691037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_sub_epi32(v[0], v[4]); 179791037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_sub_epi32(v[1], v[5]); 179891037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_sub_epi32(v[2], v[6]); 179991037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = 
_mm_sub_epi32(v[3], v[7]); 180091037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], v[12]); 180191037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], v[13]); 180291037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], v[14]); 180391037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], v[15]); 180491037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_sub_epi32(v[8], v[12]); 180591037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_sub_epi32(v[9], v[13]); 180691037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_sub_epi32(v[10], v[14]); 180791037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_sub_epi32(v[11], v[15]); 180891037db265ecdd914a26e056cf69207b4f50924ehkuang 180991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 181091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 181191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 181291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 181391037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 181491037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 181591037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 181691037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 181791037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 181891037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 181991037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 182091037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(u[11], 
k__DCT_CONST_ROUNDING); 182191037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 182291037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 182391037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 182491037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 182591037db265ecdd914a26e056cf69207b4f50924ehkuang 182691037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 182791037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 182891037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 182991037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 183091037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 183191037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 183291037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 183391037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 183491037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 183591037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 183691037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 183791037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 183891037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 183991037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 184091037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 184191037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = 
_mm_srai_epi32(u[15], DCT_CONST_BITS); 184291037db265ecdd914a26e056cf69207b4f50924ehkuang 184391037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = _mm_add_epi16(x[0], x[2]); 184491037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_add_epi16(x[1], x[3]); 184591037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_sub_epi16(x[0], x[2]); 184691037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_sub_epi16(x[1], x[3]); 184791037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_packs_epi32(v[0], v[1]); 184891037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_packs_epi32(v[2], v[3]); 184991037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_packs_epi32(v[4], v[5]); 185091037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_packs_epi32(v[6], v[7]); 185191037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = _mm_add_epi16(x[8], x[10]); 185291037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = _mm_add_epi16(x[9], x[11]); 185391037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_sub_epi16(x[8], x[10]); 185491037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = _mm_sub_epi16(x[9], x[11]); 185591037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = _mm_packs_epi32(v[8], v[9]); 185691037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(v[10], v[11]); 185791037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = _mm_packs_epi32(v[12], v[13]); 185891037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = _mm_packs_epi32(v[14], v[15]); 185991037db265ecdd914a26e056cf69207b4f50924ehkuang 186091037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 4 186191037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[2], s[3]); 186291037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[2], s[3]); 186391037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[6], s[7]); 186491037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[6], s[7]); 
186591037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(s[10], s[11]); 186691037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(s[10], s[11]); 186791037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(s[14], s[15]); 186891037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(s[14], s[15]); 186991037db265ecdd914a26e056cf69207b4f50924ehkuang 187091037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); 187191037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); 187291037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 187391037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 187491037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 187591037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 187691037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 187791037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 187891037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); 187991037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); 188091037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); 188191037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); 188291037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); 188391037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); 188491037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); 188591037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); 
188691037db265ecdd914a26e056cf69207b4f50924ehkuang 188791037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 188891037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 188991037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 189091037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 189191037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 189291037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 189391037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 189491037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 189591037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 189691037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 189791037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 189891037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 189991037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 190091037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 190191037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 190291037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 190391037db265ecdd914a26e056cf69207b4f50924ehkuang 190491037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 190591037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 190691037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = 
_mm_srai_epi32(u[2], DCT_CONST_BITS); 190791037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 190891037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 190991037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 191091037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 191191037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 191291037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 191391037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 191491037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 191591037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 191691037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 191791037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 191891037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 191991037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 192091037db265ecdd914a26e056cf69207b4f50924ehkuang 192191037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = s[0]; 192291037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_sub_epi16(kZero, s[8]); 192391037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = s[12]; 192491037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_sub_epi16(kZero, s[4]); 192591037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_packs_epi32(v[4], v[5]); 192691037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_packs_epi32(v[12], v[13]); 192791037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_packs_epi32(v[8], v[9]); 192891037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = 
_mm_packs_epi32(v[0], v[1]); 192991037db265ecdd914a26e056cf69207b4f50924ehkuang in[8] = _mm_packs_epi32(v[2], v[3]); 193091037db265ecdd914a26e056cf69207b4f50924ehkuang in[9] = _mm_packs_epi32(v[10], v[11]); 193191037db265ecdd914a26e056cf69207b4f50924ehkuang in[10] = _mm_packs_epi32(v[14], v[15]); 193291037db265ecdd914a26e056cf69207b4f50924ehkuang in[11] = _mm_packs_epi32(v[6], v[7]); 193391037db265ecdd914a26e056cf69207b4f50924ehkuang in[12] = s[5]; 193491037db265ecdd914a26e056cf69207b4f50924ehkuang in[13] = _mm_sub_epi16(kZero, s[13]); 193591037db265ecdd914a26e056cf69207b4f50924ehkuang in[14] = s[9]; 193691037db265ecdd914a26e056cf69207b4f50924ehkuang in[15] = _mm_sub_epi16(kZero, s[1]); 193791037db265ecdd914a26e056cf69207b4f50924ehkuang} 193891037db265ecdd914a26e056cf69207b4f50924ehkuang 193991037db265ecdd914a26e056cf69207b4f50924ehkuangvoid idct16_1d_8col(__m128i *in) { 194091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 194191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 194291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 194391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 194491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 194591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 194691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 194791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 194891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p28_m04 = 
pair_set_epi16(cospi_28_64, -cospi_4_64); 194991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 195091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 195191037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 195291037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 195391037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 195491037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 195591037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 195691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 195791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 195891037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 195991037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 196091037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 196191037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i v[16], u[16], s[16], t[16]; 196291037db265ecdd914a26e056cf69207b4f50924ehkuang 196391037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 1 196491037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = in[0]; 196591037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = in[8]; 196691037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = in[4]; 196791037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = 
in[12]; 196891037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = in[2]; 196991037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = in[10]; 197091037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = in[6]; 197191037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = in[14]; 197291037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = in[1]; 197391037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = in[9]; 197491037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = in[5]; 197591037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = in[13]; 197691037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = in[3]; 197791037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = in[11]; 197891037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = in[7]; 197991037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = in[15]; 198091037db265ecdd914a26e056cf69207b4f50924ehkuang 198191037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 2 198291037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[8], s[15]); 198391037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[8], s[15]); 198491037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[9], s[14]); 198591037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[9], s[14]); 198691037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(s[10], s[13]); 198791037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(s[10], s[13]); 198891037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(s[11], s[12]); 198991037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(s[11], s[12]); 199091037db265ecdd914a26e056cf69207b4f50924ehkuang 199191037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); 199291037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); 199391037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); 
199491037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); 199591037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); 199691037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); 199791037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); 199891037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); 199991037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); 200091037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); 200191037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); 200291037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); 200391037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); 200491037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); 200591037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); 200691037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); 200791037db265ecdd914a26e056cf69207b4f50924ehkuang 200891037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 200991037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 201091037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 201191037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 201291037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 201391037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 201491037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] 
= _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 201591037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 201691037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 201791037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 201891037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 201991037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 202091037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 202191037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 202291037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 202391037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 202491037db265ecdd914a26e056cf69207b4f50924ehkuang 202591037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 202691037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 202791037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 202891037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 202991037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 203091037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 203191037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 203291037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 203391037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 203491037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 
203591037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 203691037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 203791037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 203891037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 203991037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 204091037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 204191037db265ecdd914a26e056cf69207b4f50924ehkuang 204291037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = _mm_packs_epi32(u[0], u[1]); 204391037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = _mm_packs_epi32(u[2], u[3]); 204491037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = _mm_packs_epi32(u[4], u[5]); 204591037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = _mm_packs_epi32(u[6], u[7]); 204691037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_packs_epi32(u[8], u[9]); 204791037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(u[10], u[11]); 204891037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = _mm_packs_epi32(u[12], u[13]); 204991037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = _mm_packs_epi32(u[14], u[15]); 205091037db265ecdd914a26e056cf69207b4f50924ehkuang 205191037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 3 205291037db265ecdd914a26e056cf69207b4f50924ehkuang t[0] = s[0]; 205391037db265ecdd914a26e056cf69207b4f50924ehkuang t[1] = s[1]; 205491037db265ecdd914a26e056cf69207b4f50924ehkuang t[2] = s[2]; 205591037db265ecdd914a26e056cf69207b4f50924ehkuang t[3] = s[3]; 205691037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[4], s[7]); 205791037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[4], s[7]); 205891037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(s[5], s[6]); 
205991037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(s[5], s[6]); 206091037db265ecdd914a26e056cf69207b4f50924ehkuang 206191037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 206291037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 206391037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 206491037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 206591037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 206691037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 206791037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 206891037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 206991037db265ecdd914a26e056cf69207b4f50924ehkuang 207091037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 207191037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 207291037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 207391037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 207491037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 207591037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 207691037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 207791037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 207891037db265ecdd914a26e056cf69207b4f50924ehkuang 207991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 208091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = 
_mm_srai_epi32(u[1], DCT_CONST_BITS); 208191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 208291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 208391037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 208491037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 208591037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 208691037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 208791037db265ecdd914a26e056cf69207b4f50924ehkuang 208891037db265ecdd914a26e056cf69207b4f50924ehkuang t[4] = _mm_packs_epi32(u[0], u[1]); 208991037db265ecdd914a26e056cf69207b4f50924ehkuang t[7] = _mm_packs_epi32(u[2], u[3]); 209091037db265ecdd914a26e056cf69207b4f50924ehkuang t[5] = _mm_packs_epi32(u[4], u[5]); 209191037db265ecdd914a26e056cf69207b4f50924ehkuang t[6] = _mm_packs_epi32(u[6], u[7]); 209291037db265ecdd914a26e056cf69207b4f50924ehkuang t[8] = _mm_add_epi16(s[8], s[9]); 209391037db265ecdd914a26e056cf69207b4f50924ehkuang t[9] = _mm_sub_epi16(s[8], s[9]); 209491037db265ecdd914a26e056cf69207b4f50924ehkuang t[10] = _mm_sub_epi16(s[11], s[10]); 209591037db265ecdd914a26e056cf69207b4f50924ehkuang t[11] = _mm_add_epi16(s[10], s[11]); 209691037db265ecdd914a26e056cf69207b4f50924ehkuang t[12] = _mm_add_epi16(s[12], s[13]); 209791037db265ecdd914a26e056cf69207b4f50924ehkuang t[13] = _mm_sub_epi16(s[12], s[13]); 209891037db265ecdd914a26e056cf69207b4f50924ehkuang t[14] = _mm_sub_epi16(s[15], s[14]); 209991037db265ecdd914a26e056cf69207b4f50924ehkuang t[15] = _mm_add_epi16(s[14], s[15]); 210091037db265ecdd914a26e056cf69207b4f50924ehkuang 210191037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 4 210291037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(t[0], t[1]); 210391037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(t[0], 
t[1]); 210491037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(t[2], t[3]); 210591037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(t[2], t[3]); 210691037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_unpacklo_epi16(t[9], t[14]); 210791037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_unpackhi_epi16(t[9], t[14]); 210891037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_unpacklo_epi16(t[10], t[13]); 210991037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_unpackhi_epi16(t[10], t[13]); 211091037db265ecdd914a26e056cf69207b4f50924ehkuang 211191037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 211291037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 211391037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 211491037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 211591037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); 211691037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); 211791037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 211891037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 211991037db265ecdd914a26e056cf69207b4f50924ehkuang v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); 212091037db265ecdd914a26e056cf69207b4f50924ehkuang v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); 212191037db265ecdd914a26e056cf69207b4f50924ehkuang v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); 212291037db265ecdd914a26e056cf69207b4f50924ehkuang v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); 212391037db265ecdd914a26e056cf69207b4f50924ehkuang v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); 212491037db265ecdd914a26e056cf69207b4f50924ehkuang v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); 
212591037db265ecdd914a26e056cf69207b4f50924ehkuang v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); 212691037db265ecdd914a26e056cf69207b4f50924ehkuang v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); 212791037db265ecdd914a26e056cf69207b4f50924ehkuang 212891037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 212991037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 213091037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 213191037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 213291037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 213391037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 213491037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 213591037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 213691037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 213791037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 213891037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 213991037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 214091037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 214191037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 214291037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 214391037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 214491037db265ecdd914a26e056cf69207b4f50924ehkuang 214591037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] 
= _mm_srai_epi32(u[0], DCT_CONST_BITS); 214691037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 214791037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 214891037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 214991037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 215091037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 215191037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 215291037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 215391037db265ecdd914a26e056cf69207b4f50924ehkuang u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 215491037db265ecdd914a26e056cf69207b4f50924ehkuang u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 215591037db265ecdd914a26e056cf69207b4f50924ehkuang u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 215691037db265ecdd914a26e056cf69207b4f50924ehkuang u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 215791037db265ecdd914a26e056cf69207b4f50924ehkuang u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 215891037db265ecdd914a26e056cf69207b4f50924ehkuang u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 215991037db265ecdd914a26e056cf69207b4f50924ehkuang u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 216091037db265ecdd914a26e056cf69207b4f50924ehkuang u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 216191037db265ecdd914a26e056cf69207b4f50924ehkuang 216291037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = _mm_packs_epi32(u[0], u[1]); 216391037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_packs_epi32(u[2], u[3]); 216491037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_packs_epi32(u[4], u[5]); 216591037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_packs_epi32(u[6], u[7]); 216691037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_add_epi16(t[4], t[5]); 
216791037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_sub_epi16(t[4], t[5]); 216891037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_sub_epi16(t[7], t[6]); 216991037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_add_epi16(t[6], t[7]); 217091037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = t[8]; 217191037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = t[15]; 217291037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = _mm_packs_epi32(u[8], u[9]); 217391037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = _mm_packs_epi32(u[10], u[11]); 217491037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_packs_epi32(u[12], u[13]); 217591037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(u[14], u[15]); 217691037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = t[11]; 217791037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = t[12]; 217891037db265ecdd914a26e056cf69207b4f50924ehkuang 217991037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 5 218091037db265ecdd914a26e056cf69207b4f50924ehkuang t[0] = _mm_add_epi16(s[0], s[3]); 218191037db265ecdd914a26e056cf69207b4f50924ehkuang t[1] = _mm_add_epi16(s[1], s[2]); 218291037db265ecdd914a26e056cf69207b4f50924ehkuang t[2] = _mm_sub_epi16(s[1], s[2]); 218391037db265ecdd914a26e056cf69207b4f50924ehkuang t[3] = _mm_sub_epi16(s[0], s[3]); 218491037db265ecdd914a26e056cf69207b4f50924ehkuang t[4] = s[4]; 218591037db265ecdd914a26e056cf69207b4f50924ehkuang t[7] = s[7]; 218691037db265ecdd914a26e056cf69207b4f50924ehkuang 218791037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(s[5], s[6]); 218891037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(s[5], s[6]); 218991037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 219091037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 219191037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 
219291037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 219391037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 219491037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 219591037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 219691037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 219791037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 219891037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 219991037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 220091037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 220191037db265ecdd914a26e056cf69207b4f50924ehkuang t[5] = _mm_packs_epi32(u[0], u[1]); 220291037db265ecdd914a26e056cf69207b4f50924ehkuang t[6] = _mm_packs_epi32(u[2], u[3]); 220391037db265ecdd914a26e056cf69207b4f50924ehkuang 220491037db265ecdd914a26e056cf69207b4f50924ehkuang t[8] = _mm_add_epi16(s[8], s[11]); 220591037db265ecdd914a26e056cf69207b4f50924ehkuang t[9] = _mm_add_epi16(s[9], s[10]); 220691037db265ecdd914a26e056cf69207b4f50924ehkuang t[10] = _mm_sub_epi16(s[9], s[10]); 220791037db265ecdd914a26e056cf69207b4f50924ehkuang t[11] = _mm_sub_epi16(s[8], s[11]); 220891037db265ecdd914a26e056cf69207b4f50924ehkuang t[12] = _mm_sub_epi16(s[15], s[12]); 220991037db265ecdd914a26e056cf69207b4f50924ehkuang t[13] = _mm_sub_epi16(s[14], s[13]); 221091037db265ecdd914a26e056cf69207b4f50924ehkuang t[14] = _mm_add_epi16(s[13], s[14]); 221191037db265ecdd914a26e056cf69207b4f50924ehkuang t[15] = _mm_add_epi16(s[12], s[15]); 221291037db265ecdd914a26e056cf69207b4f50924ehkuang 221391037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 6 221491037db265ecdd914a26e056cf69207b4f50924ehkuang s[0] = 
_mm_add_epi16(t[0], t[7]); 221591037db265ecdd914a26e056cf69207b4f50924ehkuang s[1] = _mm_add_epi16(t[1], t[6]); 221691037db265ecdd914a26e056cf69207b4f50924ehkuang s[2] = _mm_add_epi16(t[2], t[5]); 221791037db265ecdd914a26e056cf69207b4f50924ehkuang s[3] = _mm_add_epi16(t[3], t[4]); 221891037db265ecdd914a26e056cf69207b4f50924ehkuang s[4] = _mm_sub_epi16(t[3], t[4]); 221991037db265ecdd914a26e056cf69207b4f50924ehkuang s[5] = _mm_sub_epi16(t[2], t[5]); 222091037db265ecdd914a26e056cf69207b4f50924ehkuang s[6] = _mm_sub_epi16(t[1], t[6]); 222191037db265ecdd914a26e056cf69207b4f50924ehkuang s[7] = _mm_sub_epi16(t[0], t[7]); 222291037db265ecdd914a26e056cf69207b4f50924ehkuang s[8] = t[8]; 222391037db265ecdd914a26e056cf69207b4f50924ehkuang s[9] = t[9]; 222491037db265ecdd914a26e056cf69207b4f50924ehkuang 222591037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_unpacklo_epi16(t[10], t[13]); 222691037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_unpackhi_epi16(t[10], t[13]); 222791037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_unpacklo_epi16(t[11], t[12]); 222891037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_unpackhi_epi16(t[11], t[12]); 222991037db265ecdd914a26e056cf69207b4f50924ehkuang 223091037db265ecdd914a26e056cf69207b4f50924ehkuang v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 223191037db265ecdd914a26e056cf69207b4f50924ehkuang v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 223291037db265ecdd914a26e056cf69207b4f50924ehkuang v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 223391037db265ecdd914a26e056cf69207b4f50924ehkuang v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 223491037db265ecdd914a26e056cf69207b4f50924ehkuang v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 223591037db265ecdd914a26e056cf69207b4f50924ehkuang v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 223691037db265ecdd914a26e056cf69207b4f50924ehkuang v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 223791037db265ecdd914a26e056cf69207b4f50924ehkuang v[7] = _mm_madd_epi16(u[3], 
k__cospi_p16_p16); 223891037db265ecdd914a26e056cf69207b4f50924ehkuang 223991037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 224091037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 224191037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 224291037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 224391037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 224491037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 224591037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 224691037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 224791037db265ecdd914a26e056cf69207b4f50924ehkuang 224891037db265ecdd914a26e056cf69207b4f50924ehkuang u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 224991037db265ecdd914a26e056cf69207b4f50924ehkuang u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 225091037db265ecdd914a26e056cf69207b4f50924ehkuang u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 225191037db265ecdd914a26e056cf69207b4f50924ehkuang u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 225291037db265ecdd914a26e056cf69207b4f50924ehkuang u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 225391037db265ecdd914a26e056cf69207b4f50924ehkuang u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 225491037db265ecdd914a26e056cf69207b4f50924ehkuang u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 225591037db265ecdd914a26e056cf69207b4f50924ehkuang u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 225691037db265ecdd914a26e056cf69207b4f50924ehkuang 225791037db265ecdd914a26e056cf69207b4f50924ehkuang s[10] = _mm_packs_epi32(u[0], u[1]); 225891037db265ecdd914a26e056cf69207b4f50924ehkuang s[13] = _mm_packs_epi32(u[2], u[3]); 225991037db265ecdd914a26e056cf69207b4f50924ehkuang s[11] = 
_mm_packs_epi32(u[4], u[5]); 226091037db265ecdd914a26e056cf69207b4f50924ehkuang s[12] = _mm_packs_epi32(u[6], u[7]); 226191037db265ecdd914a26e056cf69207b4f50924ehkuang s[14] = t[14]; 226291037db265ecdd914a26e056cf69207b4f50924ehkuang s[15] = t[15]; 226391037db265ecdd914a26e056cf69207b4f50924ehkuang 226491037db265ecdd914a26e056cf69207b4f50924ehkuang // stage 7 226591037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_add_epi16(s[0], s[15]); 226691037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_add_epi16(s[1], s[14]); 226791037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_add_epi16(s[2], s[13]); 226891037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_add_epi16(s[3], s[12]); 226991037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_add_epi16(s[4], s[11]); 227091037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_add_epi16(s[5], s[10]); 227191037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_add_epi16(s[6], s[9]); 227291037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_add_epi16(s[7], s[8]); 227391037db265ecdd914a26e056cf69207b4f50924ehkuang in[8] = _mm_sub_epi16(s[7], s[8]); 227491037db265ecdd914a26e056cf69207b4f50924ehkuang in[9] = _mm_sub_epi16(s[6], s[9]); 227591037db265ecdd914a26e056cf69207b4f50924ehkuang in[10] = _mm_sub_epi16(s[5], s[10]); 227691037db265ecdd914a26e056cf69207b4f50924ehkuang in[11] = _mm_sub_epi16(s[4], s[11]); 227791037db265ecdd914a26e056cf69207b4f50924ehkuang in[12] = _mm_sub_epi16(s[3], s[12]); 227891037db265ecdd914a26e056cf69207b4f50924ehkuang in[13] = _mm_sub_epi16(s[2], s[13]); 227991037db265ecdd914a26e056cf69207b4f50924ehkuang in[14] = _mm_sub_epi16(s[1], s[14]); 228091037db265ecdd914a26e056cf69207b4f50924ehkuang in[15] = _mm_sub_epi16(s[0], s[15]); 228191037db265ecdd914a26e056cf69207b4f50924ehkuang} 228291037db265ecdd914a26e056cf69207b4f50924ehkuang 228391037db265ecdd914a26e056cf69207b4f50924ehkuangvoid idct16_1d_sse2(__m128i *in0, __m128i *in1) { 
228491037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_16x16(in0, in1); 228591037db265ecdd914a26e056cf69207b4f50924ehkuang idct16_1d_8col(in0); 228691037db265ecdd914a26e056cf69207b4f50924ehkuang idct16_1d_8col(in1); 228791037db265ecdd914a26e056cf69207b4f50924ehkuang} 228891037db265ecdd914a26e056cf69207b4f50924ehkuang 228991037db265ecdd914a26e056cf69207b4f50924ehkuangvoid iadst16_1d_sse2(__m128i *in0, __m128i *in1) { 229091037db265ecdd914a26e056cf69207b4f50924ehkuang array_transpose_16x16(in0, in1); 229191037db265ecdd914a26e056cf69207b4f50924ehkuang iadst16_1d_8col(in0); 229291037db265ecdd914a26e056cf69207b4f50924ehkuang iadst16_1d_8col(in1); 229391037db265ecdd914a26e056cf69207b4f50924ehkuang} 229491037db265ecdd914a26e056cf69207b4f50924ehkuang 229591037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void load_buffer_8x16(int16_t *input, __m128i *in) { 229691037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_load_si128((__m128i *)(input + 0 * 16)); 229791037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_load_si128((__m128i *)(input + 1 * 16)); 229891037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_load_si128((__m128i *)(input + 2 * 16)); 229991037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_load_si128((__m128i *)(input + 3 * 16)); 230091037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_load_si128((__m128i *)(input + 4 * 16)); 230191037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_load_si128((__m128i *)(input + 5 * 16)); 230291037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_load_si128((__m128i *)(input + 6 * 16)); 230391037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_load_si128((__m128i *)(input + 7 * 16)); 230491037db265ecdd914a26e056cf69207b4f50924ehkuang 230591037db265ecdd914a26e056cf69207b4f50924ehkuang in[8] = _mm_load_si128((__m128i *)(input + 8 * 16)); 230691037db265ecdd914a26e056cf69207b4f50924ehkuang in[9] = _mm_load_si128((__m128i *)(input + 9 * 16)); 
230791037db265ecdd914a26e056cf69207b4f50924ehkuang in[10] = _mm_load_si128((__m128i *)(input + 10 * 16)); 230891037db265ecdd914a26e056cf69207b4f50924ehkuang in[11] = _mm_load_si128((__m128i *)(input + 11 * 16)); 230991037db265ecdd914a26e056cf69207b4f50924ehkuang in[12] = _mm_load_si128((__m128i *)(input + 12 * 16)); 231091037db265ecdd914a26e056cf69207b4f50924ehkuang in[13] = _mm_load_si128((__m128i *)(input + 13 * 16)); 231191037db265ecdd914a26e056cf69207b4f50924ehkuang in[14] = _mm_load_si128((__m128i *)(input + 14 * 16)); 231291037db265ecdd914a26e056cf69207b4f50924ehkuang in[15] = _mm_load_si128((__m128i *)(input + 15 * 16)); 231391037db265ecdd914a26e056cf69207b4f50924ehkuang} 231491037db265ecdd914a26e056cf69207b4f50924ehkuang 231591037db265ecdd914a26e056cf69207b4f50924ehkuangstatic INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { 231691037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i final_rounding = _mm_set1_epi16(1<<5); 231791037db265ecdd914a26e056cf69207b4f50924ehkuang const __m128i zero = _mm_setzero_si128(); 231891037db265ecdd914a26e056cf69207b4f50924ehkuang // Final rounding and shift 231991037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_adds_epi16(in[0], final_rounding); 232091037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_adds_epi16(in[1], final_rounding); 232191037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_adds_epi16(in[2], final_rounding); 232291037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_adds_epi16(in[3], final_rounding); 232391037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_adds_epi16(in[4], final_rounding); 232491037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_adds_epi16(in[5], final_rounding); 232591037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_adds_epi16(in[6], final_rounding); 232691037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_adds_epi16(in[7], final_rounding); 232791037db265ecdd914a26e056cf69207b4f50924ehkuang in[8] = 
_mm_adds_epi16(in[8], final_rounding); 232891037db265ecdd914a26e056cf69207b4f50924ehkuang in[9] = _mm_adds_epi16(in[9], final_rounding); 232991037db265ecdd914a26e056cf69207b4f50924ehkuang in[10] = _mm_adds_epi16(in[10], final_rounding); 233091037db265ecdd914a26e056cf69207b4f50924ehkuang in[11] = _mm_adds_epi16(in[11], final_rounding); 233191037db265ecdd914a26e056cf69207b4f50924ehkuang in[12] = _mm_adds_epi16(in[12], final_rounding); 233291037db265ecdd914a26e056cf69207b4f50924ehkuang in[13] = _mm_adds_epi16(in[13], final_rounding); 233391037db265ecdd914a26e056cf69207b4f50924ehkuang in[14] = _mm_adds_epi16(in[14], final_rounding); 233491037db265ecdd914a26e056cf69207b4f50924ehkuang in[15] = _mm_adds_epi16(in[15], final_rounding); 233591037db265ecdd914a26e056cf69207b4f50924ehkuang 233691037db265ecdd914a26e056cf69207b4f50924ehkuang in[0] = _mm_srai_epi16(in[0], 6); 233791037db265ecdd914a26e056cf69207b4f50924ehkuang in[1] = _mm_srai_epi16(in[1], 6); 233891037db265ecdd914a26e056cf69207b4f50924ehkuang in[2] = _mm_srai_epi16(in[2], 6); 233991037db265ecdd914a26e056cf69207b4f50924ehkuang in[3] = _mm_srai_epi16(in[3], 6); 234091037db265ecdd914a26e056cf69207b4f50924ehkuang in[4] = _mm_srai_epi16(in[4], 6); 234191037db265ecdd914a26e056cf69207b4f50924ehkuang in[5] = _mm_srai_epi16(in[5], 6); 234291037db265ecdd914a26e056cf69207b4f50924ehkuang in[6] = _mm_srai_epi16(in[6], 6); 234391037db265ecdd914a26e056cf69207b4f50924ehkuang in[7] = _mm_srai_epi16(in[7], 6); 234491037db265ecdd914a26e056cf69207b4f50924ehkuang in[8] = _mm_srai_epi16(in[8], 6); 234591037db265ecdd914a26e056cf69207b4f50924ehkuang in[9] = _mm_srai_epi16(in[9], 6); 234691037db265ecdd914a26e056cf69207b4f50924ehkuang in[10] = _mm_srai_epi16(in[10], 6); 234791037db265ecdd914a26e056cf69207b4f50924ehkuang in[11] = _mm_srai_epi16(in[11], 6); 234891037db265ecdd914a26e056cf69207b4f50924ehkuang in[12] = _mm_srai_epi16(in[12], 6); 234991037db265ecdd914a26e056cf69207b4f50924ehkuang in[13] = _mm_srai_epi16(in[13], 6); 
235091037db265ecdd914a26e056cf69207b4f50924ehkuang in[14] = _mm_srai_epi16(in[14], 6); 235191037db265ecdd914a26e056cf69207b4f50924ehkuang in[15] = _mm_srai_epi16(in[15], 6); 235291037db265ecdd914a26e056cf69207b4f50924ehkuang 235391037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[0]); 235491037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[1]); 235591037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[2]); 235691037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[3]); 235791037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[4]); 235891037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[5]); 235991037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[6]); 236091037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[7]); 236191037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[8]); 236291037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[9]); 236391037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[10]); 236491037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[11]); 236591037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[12]); 236691037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[13]); 236791037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[14]); 236891037db265ecdd914a26e056cf69207b4f50924ehkuang RECON_AND_STORE(dest, in[15]); 236991037db265ecdd914a26e056cf69207b4f50924ehkuang} 237091037db265ecdd914a26e056cf69207b4f50924ehkuang 237191037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride, 237291037db265ecdd914a26e056cf69207b4f50924ehkuang int tx_type) { 237391037db265ecdd914a26e056cf69207b4f50924ehkuang __m128i in0[16], in1[16]; 237491037db265ecdd914a26e056cf69207b4f50924ehkuang 
237591037db265ecdd914a26e056cf69207b4f50924ehkuang load_buffer_8x16(input, in0); 237691037db265ecdd914a26e056cf69207b4f50924ehkuang input += 8; 237791037db265ecdd914a26e056cf69207b4f50924ehkuang load_buffer_8x16(input, in1); 237891037db265ecdd914a26e056cf69207b4f50924ehkuang 237991037db265ecdd914a26e056cf69207b4f50924ehkuang switch (tx_type) { 238091037db265ecdd914a26e056cf69207b4f50924ehkuang case 0: // DCT_DCT 238191037db265ecdd914a26e056cf69207b4f50924ehkuang idct16_1d_sse2(in0, in1); 238291037db265ecdd914a26e056cf69207b4f50924ehkuang idct16_1d_sse2(in0, in1); 238391037db265ecdd914a26e056cf69207b4f50924ehkuang break; 238491037db265ecdd914a26e056cf69207b4f50924ehkuang case 1: // ADST_DCT 238591037db265ecdd914a26e056cf69207b4f50924ehkuang idct16_1d_sse2(in0, in1); 238691037db265ecdd914a26e056cf69207b4f50924ehkuang iadst16_1d_sse2(in0, in1); 238791037db265ecdd914a26e056cf69207b4f50924ehkuang break; 238891037db265ecdd914a26e056cf69207b4f50924ehkuang case 2: // DCT_ADST 238991037db265ecdd914a26e056cf69207b4f50924ehkuang iadst16_1d_sse2(in0, in1); 239091037db265ecdd914a26e056cf69207b4f50924ehkuang idct16_1d_sse2(in0, in1); 239191037db265ecdd914a26e056cf69207b4f50924ehkuang break; 239291037db265ecdd914a26e056cf69207b4f50924ehkuang case 3: // ADST_ADST 239391037db265ecdd914a26e056cf69207b4f50924ehkuang iadst16_1d_sse2(in0, in1); 239491037db265ecdd914a26e056cf69207b4f50924ehkuang iadst16_1d_sse2(in0, in1); 239591037db265ecdd914a26e056cf69207b4f50924ehkuang break; 239691037db265ecdd914a26e056cf69207b4f50924ehkuang default: 239791037db265ecdd914a26e056cf69207b4f50924ehkuang assert(0); 239891037db265ecdd914a26e056cf69207b4f50924ehkuang break; 239991037db265ecdd914a26e056cf69207b4f50924ehkuang } 240091037db265ecdd914a26e056cf69207b4f50924ehkuang 240191037db265ecdd914a26e056cf69207b4f50924ehkuang write_buffer_8x16(dest, in0, stride); 240291037db265ecdd914a26e056cf69207b4f50924ehkuang dest += 8; 240391037db265ecdd914a26e056cf69207b4f50924ehkuang write_buffer_8x16(dest, in1, 
stride); 240491037db265ecdd914a26e056cf69207b4f50924ehkuang} 240591037db265ecdd914a26e056cf69207b4f50924ehkuang 2406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, 2407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int stride) { 2408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 2409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i final_rounding = _mm_set1_epi16(1<<5); 2410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 2411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 2413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 2414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 2415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 2416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 2417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 2418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 2419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 2420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 2422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 2423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 
2424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 2425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 2427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 2428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 2429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 2430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 2432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 2433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 2436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, 2438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, 2439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = zero, in11 = zero, in12 = zero, in13 = zero, 2440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = zero, in15 = zero; 2441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, 2442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, 
2443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l12 = zero, l13 = zero, l14 = zero, l15 = zero; 2444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 2446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 2447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8_0, stp1_12_0; 2448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 2449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; 2450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 2451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int i; 2452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 1-D idct. Load input data. 2453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_load_si128((__m128i *)input); 2454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); 2455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); 2456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); 2457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); 2458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); 2459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); 2460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); 2461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); 2463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X4(in8, in9, in10, in11, 
in8, in9, in10, in11); 2464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage2 2466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); 2468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); 2469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); 2470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); 2471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); 2473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); 2474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); 2475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); 2476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); 2477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); 2478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); 2479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); 2480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 2482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 2483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_add_epi32(tmp4, rounding); 2484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_add_epi32(tmp6, rounding); 2485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_add_epi32(tmp1, rounding); 2486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_add_epi32(tmp3, rounding); 
2487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_add_epi32(tmp5, rounding); 2488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_add_epi32(tmp7, rounding); 2489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 2493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 2494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 2495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 2496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 2497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 2498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_8 = _mm_packs_epi32(tmp0, zero); 2500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_15 = _mm_packs_epi32(tmp2, zero); 2501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_9 = _mm_packs_epi32(tmp4, zero); 2502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_14 = _mm_packs_epi32(tmp6, zero); 2503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_10 = _mm_packs_epi32(tmp1, zero); 2505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_13 = _mm_packs_epi32(tmp3, zero); 2506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_11 = _mm_packs_epi32(tmp5, zero); 2507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_12 = _mm_packs_epi32(tmp7, zero); 2508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage3 
2511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); 2513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); 2514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); 2516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); 2517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); 2518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); 2519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 2521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 2522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_add_epi32(tmp4, rounding); 2523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_add_epi32(tmp6, rounding); 2524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 2528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 2529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_4 = _mm_packs_epi32(tmp0, zero); 2531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_7 = _mm_packs_epi32(tmp2, zero); 2532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_5 = _mm_packs_epi32(tmp4, zero); 2533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_6 = _mm_packs_epi32(tmp6, zero); 2534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
2535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); 2536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); 2537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); 2538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_11 = _mm_add_epi16(stp2_11, stp2_10); 2539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); 2541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); 2542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); 2543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_15 = _mm_add_epi16(stp2_15, stp2_14); 2544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage4 2547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); 2549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); 2550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); 2551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); 2552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); 2554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); 2555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); 2556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); 2557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); 
2558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); 2559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); 2560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); 2561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 2563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 2564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_add_epi32(tmp4, rounding); 2565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_add_epi32(tmp6, rounding); 2566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_add_epi32(tmp1, rounding); 2567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_add_epi32(tmp3, rounding); 2568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_add_epi32(tmp5, rounding); 2569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_add_epi32(tmp7, rounding); 2570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2572ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 2574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 2575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 2576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 2577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 2578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 2579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_0 = _mm_packs_epi32(tmp0, zero); 
2581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_1 = _mm_packs_epi32(tmp2, zero); 2582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_2 = _mm_packs_epi32(tmp4, zero); 2583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_3 = _mm_packs_epi32(tmp6, zero); 2584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_9 = _mm_packs_epi32(tmp1, zero); 2585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_14 = _mm_packs_epi32(tmp3, zero); 2586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_10 = _mm_packs_epi32(tmp5, zero); 2587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_13 = _mm_packs_epi32(tmp7, zero); 2588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_4 = _mm_add_epi16(stp1_4, stp1_5); 2590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); 2591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); 2592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_7 = _mm_add_epi16(stp1_7, stp1_6); 2593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage5 and Stage6 2596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_0 = _mm_add_epi16(stp2_0, stp2_3); 2598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_1 = _mm_add_epi16(stp2_1, stp2_2); 2599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); 2600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); 2601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); 2603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_9 = _mm_add_epi16(stp2_9, stp2_10); 2604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); 
2605ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); 2606ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); 2608ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); 2609ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_14 = _mm_add_epi16(stp2_14, stp2_13); 2610ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); 2611ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2612ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2613ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage6 2614ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2615ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); 2616ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); 2617ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); 2618ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2619ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); 2620ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); 2621ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); 2622ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); 2623ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); 2624ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); 2625ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2626ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_add_epi32(tmp1, rounding); 2627ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_add_epi32(tmp3, rounding); 2628ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 
2629ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 2630ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_add_epi32(tmp4, rounding); 2631ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_add_epi32(tmp6, rounding); 2632ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2633ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 2634ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 2635ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2636ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2637ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 2638ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 2639ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2640ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_5 = _mm_packs_epi32(tmp1, zero); 2641ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_6 = _mm_packs_epi32(tmp3, zero); 2642ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_10 = _mm_packs_epi32(tmp0, zero); 2643ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_13 = _mm_packs_epi32(tmp2, zero); 2644ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_11 = _mm_packs_epi32(tmp4, zero); 2645ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_12 = _mm_packs_epi32(tmp6, zero); 2646ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2647ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_0 = _mm_add_epi16(stp1_0, stp2_7); 2648ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_1 = _mm_add_epi16(stp1_1, stp1_6); 2649ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_2 = _mm_add_epi16(stp1_2, stp1_5); 2650ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_3 = _mm_add_epi16(stp1_3, stp2_4); 2651ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); 
2652ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); 2653ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); 2654ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); 2655ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2656ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2657ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage7. Left 8x16 only. 2658ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l0 = _mm_add_epi16(stp2_0, stp1_15); 2659ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l1 = _mm_add_epi16(stp2_1, stp1_14); 2660ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l2 = _mm_add_epi16(stp2_2, stp2_13); 2661ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l3 = _mm_add_epi16(stp2_3, stp2_12); 2662ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l4 = _mm_add_epi16(stp2_4, stp2_11); 2663ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l5 = _mm_add_epi16(stp2_5, stp2_10); 2664ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l6 = _mm_add_epi16(stp2_6, stp1_9); 2665ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l7 = _mm_add_epi16(stp2_7, stp1_8); 2666ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l8 = _mm_sub_epi16(stp2_7, stp1_8); 2667ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l9 = _mm_sub_epi16(stp2_6, stp1_9); 2668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l10 = _mm_sub_epi16(stp2_5, stp2_10); 2669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l11 = _mm_sub_epi16(stp2_4, stp2_11); 2670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l12 = _mm_sub_epi16(stp2_3, stp2_12); 2671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l13 = _mm_sub_epi16(stp2_2, stp2_13); 2672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l14 = _mm_sub_epi16(stp2_1, stp1_14); 2673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang l15 = _mm_sub_epi16(stp2_0, stp1_15); 2674ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 2-D idct. 
We do 2 8x16 blocks. 2676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang for (i = 0; i < 2; i++) { 2677ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (i == 0) 2678ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, 2679ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5, in6, in7); 2680ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2681ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (i == 1) 2682ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, 2683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4, in5, in6, in7); 2684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; 2686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang IDCT16x16_1D 2688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage7 2690ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_add_epi16(stp2_0, stp1_15); 2691ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_add_epi16(stp2_1, stp1_14); 2692ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_add_epi16(stp2_2, stp2_13); 2693ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_add_epi16(stp2_3, stp2_12); 2694ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_add_epi16(stp2_4, stp2_11); 2695ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_add_epi16(stp2_5, stp2_10); 2696ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_add_epi16(stp2_6, stp1_9); 2697ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_add_epi16(stp2_7, stp1_8); 2698ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_sub_epi16(stp2_7, stp1_8); 2699ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_sub_epi16(stp2_6, stp1_9); 2700ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = 
_mm_sub_epi16(stp2_5, stp2_10); 2701ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_sub_epi16(stp2_4, stp2_11); 2702ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_sub_epi16(stp2_3, stp2_12); 2703ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_sub_epi16(stp2_2, stp2_13); 2704ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_sub_epi16(stp2_1, stp1_14); 2705ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_sub_epi16(stp2_0, stp1_15); 2706ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2707ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final rounding and shift 2708ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_adds_epi16(in0, final_rounding); 2709ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_adds_epi16(in1, final_rounding); 2710ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_adds_epi16(in2, final_rounding); 2711ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_adds_epi16(in3, final_rounding); 2712ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_adds_epi16(in4, final_rounding); 2713ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_adds_epi16(in5, final_rounding); 2714ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_adds_epi16(in6, final_rounding); 2715ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_adds_epi16(in7, final_rounding); 2716ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_adds_epi16(in8, final_rounding); 2717ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_adds_epi16(in9, final_rounding); 2718ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_adds_epi16(in10, final_rounding); 2719ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_adds_epi16(in11, final_rounding); 2720ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_adds_epi16(in12, final_rounding); 2721ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_adds_epi16(in13, final_rounding); 2722ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = 
_mm_adds_epi16(in14, final_rounding); 2723ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_adds_epi16(in15, final_rounding); 2724ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2725ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_srai_epi16(in0, 6); 2726ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_srai_epi16(in1, 6); 2727ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_srai_epi16(in2, 6); 2728ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_srai_epi16(in3, 6); 2729ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_srai_epi16(in4, 6); 2730ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_srai_epi16(in5, 6); 2731ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_srai_epi16(in6, 6); 2732ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_srai_epi16(in7, 6); 2733ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_srai_epi16(in8, 6); 2734ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_srai_epi16(in9, 6); 2735ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_srai_epi16(in10, 6); 2736ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_srai_epi16(in11, 6); 2737ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_srai_epi16(in12, 6); 2738ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_srai_epi16(in13, 6); 2739ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_srai_epi16(in14, 6); 2740ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_srai_epi16(in15, 6); 2741ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2742ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in0); 2743ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in1); 2744ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in2); 2745ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in3); 2746ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in4); 2747ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
RECON_AND_STORE(dest, in5); 2748ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in6); 2749ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in7); 2750ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in8); 2751ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in9); 2752ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in10); 2753ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in11); 2754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in12); 2755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in13); 2756ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in14); 2757ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in15); 2758ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2759ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang dest += 8 - (stride * 16); 2760ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2761ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 2762ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2763ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { 2764ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 2765ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i final_rounding = _mm_set1_epi16(1<<5); 2766ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2767ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // idct constants for each stage 2768ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 2769ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 2770ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 2771ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const 
__m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 2772ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 2773ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 2774ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 2775ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 2776ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 2777ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); 2778ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); 2779ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); 2780ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); 2781ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); 2782ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); 2783ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); 2784ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2785ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 2786ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 2787ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 2788ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 
2789ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 2790ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 2791ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 2792ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 2793ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2794ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 2795ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 2796ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 2797ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 2798ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 2799ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 2800ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 2801ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 2802ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 2803ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 2804ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2805ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 2806ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 
2807ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 2808ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 2809ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2810ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 2811ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 2812ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2813ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 2814ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2815ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, 2816ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, 2817ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in24, in25, in26, in27, in28, in29, in30, in31; 2818ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i col[128]; 2819ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 2820ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 2821ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 2822ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 2823ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_30, stp1_31; 2824ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 2825ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 
2826ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 2827ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 2828ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_30, stp2_31; 2829ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 2830ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int i, j; 2831ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2832ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. 2833ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang for (i = 0; i < 8; i++) { 2834ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (i < 4) { 2835ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // First 1-D idct 2836ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Load input data. 2837ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_load_si128((__m128i *)input); 2838ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); 2839ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in16 = _mm_load_si128((__m128i *)(input + 8 * 2)); 2840ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in24 = _mm_load_si128((__m128i *)(input + 8 * 3)); 2841ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_load_si128((__m128i *)(input + 8 * 4)); 2842ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_load_si128((__m128i *)(input + 8 * 5)); 2843ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in17 = _mm_load_si128((__m128i *)(input + 8 * 6)); 2844ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in25 = _mm_load_si128((__m128i *)(input + 8 * 7)); 2845ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_load_si128((__m128i *)(input + 8 * 8)); 2846ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_load_si128((__m128i *)(input + 8 * 9)); 2847ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in18 = 
_mm_load_si128((__m128i *)(input + 8 * 10)); 2848ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in26 = _mm_load_si128((__m128i *)(input + 8 * 11)); 2849ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_load_si128((__m128i *)(input + 8 * 12)); 2850ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_load_si128((__m128i *)(input + 8 * 13)); 2851ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in19 = _mm_load_si128((__m128i *)(input + 8 * 14)); 2852ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in27 = _mm_load_si128((__m128i *)(input + 8 * 15)); 2853ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2854ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_load_si128((__m128i *)(input + 8 * 16)); 2855ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_load_si128((__m128i *)(input + 8 * 17)); 2856ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in20 = _mm_load_si128((__m128i *)(input + 8 * 18)); 2857ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in28 = _mm_load_si128((__m128i *)(input + 8 * 19)); 2858ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_load_si128((__m128i *)(input + 8 * 20)); 2859ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_load_si128((__m128i *)(input + 8 * 21)); 2860ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in21 = _mm_load_si128((__m128i *)(input + 8 * 22)); 2861ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in29 = _mm_load_si128((__m128i *)(input + 8 * 23)); 2862ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_load_si128((__m128i *)(input + 8 * 24)); 2863ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_load_si128((__m128i *)(input + 8 * 25)); 2864ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in22 = _mm_load_si128((__m128i *)(input + 8 * 26)); 2865ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in30 = _mm_load_si128((__m128i *)(input + 8 * 27)); 2866ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_load_si128((__m128i *)(input + 8 * 28)); 2867ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
in15 = _mm_load_si128((__m128i *)(input + 8 * 29)); 2868ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in23 = _mm_load_si128((__m128i *)(input + 8 * 30)); 2869ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in31 = _mm_load_si128((__m128i *)(input + 8 * 31)); 2870ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2871ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang input += 256; 2872ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2873ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Transpose 32x8 block to 8x32 block 2874ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 2875ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4, in5, in6, in7); 2876ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, 2877ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10, in11, in12, in13, in14, in15); 2878ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, 2879ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in18, in19, in20, in21, in22, in23); 2880ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, 2881ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in26, in27, in28, in29, in30, in31); 2882ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } else { 2883ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Second 1-D idct 2884ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang j = i - 4; 2885ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2886ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Transpose 32x8 block to 8x32 block 2887ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 2888ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 2889ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, 
2890ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5, in6, in7); 2891ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang j += 4; 2892ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 2893ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 2894ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, 2895ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11, in12, in13, in14, in15); 2896ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang j += 4; 2897ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 2898ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 2899ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, 2900ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in19, in20, in21, in22, in23); 2901ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang j += 4; 2902ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 2903ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 2904ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, 2905ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in28, in29, in30, in31); 2906ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2907ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2908ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage1 2909ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2910ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); 2911ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); 2912ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); 
2913ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); 2914ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2915ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); 2916ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); 2917ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); 2918ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); 2919ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2920ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); 2921ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); 2922ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); 2923ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); 2924ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2925ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); 2926ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); 2927ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); 2928ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); 2929ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2930ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, 2931ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, 2932ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_17, stp1_30) 2933ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, 
2934ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, 2935ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_19, stp1_28) 2936ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, 2937ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, 2938ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_21, stp1_26) 2939ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, 2940ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, 2941ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_23, stp1_24) 2942ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2943ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2944ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage2 2945ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2946ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); 2947ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); 2948ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); 2949ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); 2950ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2951ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); 2952ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); 2953ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); 2954ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); 2955ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2956ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, 
lo_18_14, hi_18_14, stg2_0, 2957ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, 2958ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_14) 2959ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, 2960ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, 2961ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_11, stp2_12) 2962ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2963ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_16 = _mm_add_epi16(stp1_16, stp1_17); 2964ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); 2965ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); 2966ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_19 = _mm_add_epi16(stp1_19, stp1_18); 2967ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2968ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_20 = _mm_add_epi16(stp1_20, stp1_21); 2969ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); 2970ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); 2971ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_23 = _mm_add_epi16(stp1_23, stp1_22); 2972ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2973ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_24 = _mm_add_epi16(stp1_24, stp1_25); 2974ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); 2975ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); 2976ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_27 = _mm_add_epi16(stp1_27, stp1_26); 2977ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2978ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_28 = _mm_add_epi16(stp1_28, stp1_29); 2979ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); 
2980ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); 2981ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_31 = _mm_add_epi16(stp1_31, stp1_30); 2982ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 2983ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2984ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage3 2985ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 2986ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); 2987ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); 2988ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); 2989ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); 2990ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2991ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); 2992ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); 2993ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); 2994ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); 2995ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 2996ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); 2997ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); 2998ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); 2999ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); 3000ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3001ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, 
lo_20_12, hi_20_12, stg3_0, 3002ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, 3003ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_6) 3004ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3005ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8 = _mm_add_epi16(stp2_8, stp2_9); 3006ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); 3007ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); 3008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_11 = _mm_add_epi16(stp2_11, stp2_10); 3009ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_12 = _mm_add_epi16(stp2_12, stp2_13); 3010ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); 3011ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); 3012ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_15 = _mm_add_epi16(stp2_15, stp2_14); 3013ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3014ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, 3015ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, 3016ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_18, stp1_29) 3017ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, 3018ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, 3019ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_22, stp1_25) 3020ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3021ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_16 = stp2_16; 3022ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_31 = stp2_31; 3023ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_19 = stp2_19; 3024ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_20 = stp2_20; 3025ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_23 = 
stp2_23; 3026ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_24 = stp2_24; 3027ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_27 = stp2_27; 3028ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_28 = stp2_28; 3029ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 3030ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3031ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage4 3032ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 3033ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); 3034ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); 3035ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); 3036ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); 3037ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3038ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); 3039ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); 3040ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); 3041ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); 3042ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3043ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, 3044ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, 3045ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_2, stp2_3) 3046ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3047ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_4 = _mm_add_epi16(stp1_4, stp1_5); 3048ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); 3049ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_6 = 
_mm_sub_epi16(stp1_7, stp1_6); 3050ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_7 = _mm_add_epi16(stp1_7, stp1_6); 3051ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3052ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, 3053ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, 3054ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_10, stp2_13) 3055ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3056ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_8 = stp1_8; 3057ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_15 = stp1_15; 3058ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_11 = stp1_11; 3059ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_12 = stp1_12; 3060ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3061ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_16 = _mm_add_epi16(stp1_16, stp1_19); 3062ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_17 = _mm_add_epi16(stp1_17, stp1_18); 3063ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); 3064ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); 3065ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); 3066ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); 3067ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_22 = _mm_add_epi16(stp1_22, stp1_21); 3068ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_23 = _mm_add_epi16(stp1_23, stp1_20); 3069ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3070ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_24 = _mm_add_epi16(stp1_24, stp1_27); 3071ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_25 = _mm_add_epi16(stp1_25, stp1_26); 3072ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); 3073ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_27 = 
_mm_sub_epi16(stp1_24, stp1_27); 3074ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); 3075ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); 3076ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_30 = _mm_add_epi16(stp1_29, stp1_30); 3077ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_31 = _mm_add_epi16(stp1_28, stp1_31); 3078ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 3079ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3080ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage5 3081ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 3082ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); 3083ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); 3084ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); 3085ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); 3086ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3087ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); 3088ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); 3089ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); 3090ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); 3091ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3092ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); 3093ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); 3094ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3095ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_0 = _mm_add_epi16(stp2_0, 
stp2_3); 3096ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_1 = _mm_add_epi16(stp2_1, stp2_2); 3097ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); 3098ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); 3099ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); 3101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); 3102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); 3103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); 3104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_add_epi32(tmp0, rounding); 3106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_add_epi32(tmp1, rounding); 3107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_add_epi32(tmp2, rounding); 3108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_add_epi32(tmp3, rounding); 3109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 3111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 3112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 3113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 3114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_5 = _mm_packs_epi32(tmp0, tmp1); 3116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_6 = _mm_packs_epi32(tmp2, tmp3); 3117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_4 = stp2_4; 3119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_7 = stp2_7; 
3120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8 = _mm_add_epi16(stp2_8, stp2_11); 3122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_9 = _mm_add_epi16(stp2_9, stp2_10); 3123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); 3124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); 3125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); 3126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); 3127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_14 = _mm_add_epi16(stp2_14, stp2_13); 3128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_15 = _mm_add_epi16(stp2_15, stp2_12); 3129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_16 = stp2_16; 3131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_17 = stp2_17; 3132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, 3134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, 3135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_19, stp1_28) 3136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, 3137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, 3138ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_21, stp1_26) 3139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_22 = stp2_22; 3141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_23 = stp2_23; 3142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_24 = stp2_24; 3143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_25 = stp2_25; 3144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_30 = 
stp2_30; 3145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_31 = stp2_31; 3146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 3147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage6 3149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 3150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); 3151ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); 3152ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); 3153ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); 3154ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_0 = _mm_add_epi16(stp1_0, stp1_7); 3156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_1 = _mm_add_epi16(stp1_1, stp1_6); 3157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_2 = _mm_add_epi16(stp1_2, stp1_5); 3158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_3 = _mm_add_epi16(stp1_3, stp1_4); 3159ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); 3160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); 3161ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); 3162ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); 3163ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3164ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_8 = stp1_8; 3165ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_9 = stp1_9; 3166ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_14 = stp1_14; 3167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_15 = stp1_15; 3168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_10_13, 
hi_10_13, lo_11_12, hi_11_12, 3170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, 3171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_13, stp2_11, stp2_12) 3172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_16 = _mm_add_epi16(stp1_16, stp1_23); 3174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_17 = _mm_add_epi16(stp1_17, stp1_22); 3175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_18 = _mm_add_epi16(stp1_18, stp1_21); 3176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_19 = _mm_add_epi16(stp1_19, stp1_20); 3177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); 3178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); 3179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); 3180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); 3181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); 3183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); 3184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); 3185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); 3186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_28 = _mm_add_epi16(stp1_27, stp1_28); 3187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_29 = _mm_add_epi16(stp1_26, stp1_29); 3188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_30 = _mm_add_epi16(stp1_25, stp1_30); 3189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp2_31 = _mm_add_epi16(stp1_24, stp1_31); 3190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 3191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Stage7 
3193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang { 3194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); 3195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); 3196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); 3197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); 3198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); 3200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); 3201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); 3202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); 3203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_0 = _mm_add_epi16(stp2_0, stp2_15); 3205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_1 = _mm_add_epi16(stp2_1, stp2_14); 3206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_2 = _mm_add_epi16(stp2_2, stp2_13); 3207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_3 = _mm_add_epi16(stp2_3, stp2_12); 3208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_4 = _mm_add_epi16(stp2_4, stp2_11); 3209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_5 = _mm_add_epi16(stp2_5, stp2_10); 3210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_6 = _mm_add_epi16(stp2_6, stp2_9); 3211ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_7 = _mm_add_epi16(stp2_7, stp2_8); 3212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); 3213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); 
3214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); 3215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); 3216ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); 3217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); 3218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); 3219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); 3220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3221ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_16 = stp2_16; 3222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_17 = stp2_17; 3223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_18 = stp2_18; 3224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_19 = stp2_19; 3225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, 3227ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, 3228ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_21, stp1_26) 3229ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, 3230ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, 3231ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_23, stp1_24) 3232ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3233ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_28 = stp2_28; 3234ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_29 = stp2_29; 3235ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_30 = stp2_30; 3236ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang stp1_31 = stp2_31; 3237ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 3238ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3239ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 
final stage 3240ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (i < 4) { 3241ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 1_D: Store 32 intermediate results for each 8x32 block. 3242ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31); 3243ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30); 3244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29); 3245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28); 3246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27); 3247ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26); 3248ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25); 3249ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24); 3250ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23); 3251ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22); 3252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21); 3253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20); 3254ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19); 3255ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18); 3256ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17); 3257ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16); 3258ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); 3259ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 17] = 
_mm_sub_epi16(stp1_14, stp1_17); 3260ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); 3261ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); 3262ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); 3263ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); 3264ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); 3265ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); 3266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); 3267ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); 3268ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); 3269ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); 3270ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); 3271ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); 3272ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); 3273ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); 3274ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } else { 3275ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang const __m128i zero = _mm_setzero_si128(); 3276ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // 2_D: Calculate the results and store them to destination. 
3278ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_add_epi16(stp1_0, stp1_31); 3279ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_add_epi16(stp1_1, stp1_30); 3280ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_add_epi16(stp1_2, stp1_29); 3281ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_add_epi16(stp1_3, stp1_28); 3282ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_add_epi16(stp1_4, stp1_27); 3283ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_add_epi16(stp1_5, stp1_26); 3284ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_add_epi16(stp1_6, stp1_25); 3285ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_add_epi16(stp1_7, stp1_24); 3286ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_add_epi16(stp1_8, stp1_23); 3287ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_add_epi16(stp1_9, stp1_22); 3288ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_add_epi16(stp1_10, stp1_21); 3289ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_add_epi16(stp1_11, stp1_20); 3290ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_add_epi16(stp1_12, stp1_19); 3291ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_add_epi16(stp1_13, stp1_18); 3292ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_add_epi16(stp1_14, stp1_17); 3293ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_add_epi16(stp1_15, stp1_16); 3294ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in16 = _mm_sub_epi16(stp1_15, stp1_16); 3295ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in17 = _mm_sub_epi16(stp1_14, stp1_17); 3296ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in18 = _mm_sub_epi16(stp1_13, stp1_18); 3297ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in19 = _mm_sub_epi16(stp1_12, stp1_19); 3298ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in20 = _mm_sub_epi16(stp1_11, stp1_20); 3299ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in21 = _mm_sub_epi16(stp1_10, stp1_21); 
3300ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in22 = _mm_sub_epi16(stp1_9, stp1_22); 3301ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in23 = _mm_sub_epi16(stp1_8, stp1_23); 3302ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in24 = _mm_sub_epi16(stp1_7, stp1_24); 3303ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in25 = _mm_sub_epi16(stp1_6, stp1_25); 3304ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in26 = _mm_sub_epi16(stp1_5, stp1_26); 3305ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in27 = _mm_sub_epi16(stp1_4, stp1_27); 3306ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in28 = _mm_sub_epi16(stp1_3, stp1_28); 3307ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in29 = _mm_sub_epi16(stp1_2, stp1_29); 3308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in30 = _mm_sub_epi16(stp1_1, stp1_30); 3309ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in31 = _mm_sub_epi16(stp1_0, stp1_31); 3310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang // Final rounding and shift 3312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_adds_epi16(in0, final_rounding); 3313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_adds_epi16(in1, final_rounding); 3314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_adds_epi16(in2, final_rounding); 3315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_adds_epi16(in3, final_rounding); 3316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_adds_epi16(in4, final_rounding); 3317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_adds_epi16(in5, final_rounding); 3318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_adds_epi16(in6, final_rounding); 3319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_adds_epi16(in7, final_rounding); 3320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_adds_epi16(in8, final_rounding); 3321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_adds_epi16(in9, final_rounding); 
3322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_adds_epi16(in10, final_rounding); 3323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_adds_epi16(in11, final_rounding); 3324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_adds_epi16(in12, final_rounding); 3325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_adds_epi16(in13, final_rounding); 3326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_adds_epi16(in14, final_rounding); 3327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_adds_epi16(in15, final_rounding); 3328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in16 = _mm_adds_epi16(in16, final_rounding); 3329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in17 = _mm_adds_epi16(in17, final_rounding); 3330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in18 = _mm_adds_epi16(in18, final_rounding); 3331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in19 = _mm_adds_epi16(in19, final_rounding); 3332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in20 = _mm_adds_epi16(in20, final_rounding); 3333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in21 = _mm_adds_epi16(in21, final_rounding); 3334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in22 = _mm_adds_epi16(in22, final_rounding); 3335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in23 = _mm_adds_epi16(in23, final_rounding); 3336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in24 = _mm_adds_epi16(in24, final_rounding); 3337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in25 = _mm_adds_epi16(in25, final_rounding); 3338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in26 = _mm_adds_epi16(in26, final_rounding); 3339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in27 = _mm_adds_epi16(in27, final_rounding); 3340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in28 = _mm_adds_epi16(in28, final_rounding); 3341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in29 = _mm_adds_epi16(in29, final_rounding); 3342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in30 = _mm_adds_epi16(in30, 
final_rounding); 3343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in31 = _mm_adds_epi16(in31, final_rounding); 3344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in0 = _mm_srai_epi16(in0, 6); 3346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in1 = _mm_srai_epi16(in1, 6); 3347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in2 = _mm_srai_epi16(in2, 6); 3348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in3 = _mm_srai_epi16(in3, 6); 3349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in4 = _mm_srai_epi16(in4, 6); 3350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in5 = _mm_srai_epi16(in5, 6); 3351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in6 = _mm_srai_epi16(in6, 6); 3352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in7 = _mm_srai_epi16(in7, 6); 3353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in8 = _mm_srai_epi16(in8, 6); 3354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in9 = _mm_srai_epi16(in9, 6); 3355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in10 = _mm_srai_epi16(in10, 6); 3356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in11 = _mm_srai_epi16(in11, 6); 3357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in12 = _mm_srai_epi16(in12, 6); 3358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in13 = _mm_srai_epi16(in13, 6); 3359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in14 = _mm_srai_epi16(in14, 6); 3360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in15 = _mm_srai_epi16(in15, 6); 3361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in16 = _mm_srai_epi16(in16, 6); 3362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in17 = _mm_srai_epi16(in17, 6); 3363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in18 = _mm_srai_epi16(in18, 6); 3364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in19 = _mm_srai_epi16(in19, 6); 3365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in20 = _mm_srai_epi16(in20, 6); 3366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in21 = _mm_srai_epi16(in21, 6); 
3367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in22 = _mm_srai_epi16(in22, 6); 3368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in23 = _mm_srai_epi16(in23, 6); 3369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in24 = _mm_srai_epi16(in24, 6); 3370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in25 = _mm_srai_epi16(in25, 6); 3371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in26 = _mm_srai_epi16(in26, 6); 3372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in27 = _mm_srai_epi16(in27, 6); 3373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in28 = _mm_srai_epi16(in28, 6); 3374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in29 = _mm_srai_epi16(in29, 6); 3375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in30 = _mm_srai_epi16(in30, 6); 3376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang in31 = _mm_srai_epi16(in31, 6); 3377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in0); 3379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in1); 3380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in2); 3381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in3); 3382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in4); 3383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in5); 3384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in6); 3385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in7); 3386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in8); 3387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in9); 3388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in10); 3389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in11); 3390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in12); 3391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in13); 
3392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in14); 3393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in15); 3394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in16); 3395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in17); 3396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in18); 3397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in19); 3398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in20); 3399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in21); 3400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in22); 3401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in23); 3402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in24); 3403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in25); 3404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in26); 3405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in27); 3406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in28); 3407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in29); 3408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in30); 3409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RECON_AND_STORE(dest, in31); 3410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 3411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang dest += 8 - (stride * 32); 3412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 3413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } 3414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 3415