1da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* 2da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * 4da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian */ 10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h" 12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/x86/inv_txfm_sse2.h" 13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/x86/txfm_common_sse2.h" 14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define RECON_AND_STORE4X4(dest, in_x) \ 16da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 17da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ 18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = _mm_unpacklo_epi8(d0, zero); \ 19da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = _mm_add_epi16(in_x, d0); \ 
20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = _mm_packus_epi16(d0, d0); \ 21da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *(int *)(dest) = _mm_cvtsi128_si32(d0); \ 22da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 23da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 242263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, 252263fc984bdc858ee931d3e35c87c404de923950Johann int stride) { 26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128(); 27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i eight = _mm_set1_epi16(8); 28da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i cst = _mm_setr_epi16( 29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, 30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, 31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian (int16_t)cospi_8_64, (int16_t)cospi_24_64); 32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i input0, input1, input2, input3; 34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Rows 362263fc984bdc858ee931d3e35c87c404de923950Johann input0 = load_input_data(input); 372263fc984bdc858ee931d3e35c87c404de923950Johann input2 = load_input_data(input + 8); 38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Construct i3, i1, i3, i1, i2, i0, i2, i0 
40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_shufflelo_epi16(input0, 0xd8); 41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_shufflehi_epi16(input0, 0xd8); 42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_shufflelo_epi16(input2, 0xd8); 43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_shufflehi_epi16(input2, 0xd8); 44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_unpackhi_epi32(input0, input0); 46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_unpacklo_epi32(input0, input0); 47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_unpackhi_epi32(input2, input2); 48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_unpacklo_epi32(input2, input2); 49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage 1 51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_madd_epi16(input0, cst); 52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_madd_epi16(input1, cst); 53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_madd_epi16(input2, cst); 54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_madd_epi16(input3, cst); 55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_add_epi32(input0, rounding); 57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_add_epi32(input1, rounding); 58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_add_epi32(input2, rounding); 
59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_add_epi32(input3, rounding); 60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); 62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); 63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); 64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); 65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage 2 67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_packs_epi32(input0, input1); 68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_packs_epi32(input2, input3); 69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Transpose 71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_unpacklo_epi16(input0, input1); 72da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_unpackhi_epi16(input0, input1); 73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_unpacklo_epi32(input2, input3); 74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_unpackhi_epi32(input2, input3); 75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Switch column2, column 3, and then, we got: 77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // input2: column1, column 0; input3: column2, column 3. 
78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_shuffle_epi32(input1, 0x4e); 79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_add_epi16(input0, input1); 80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_sub_epi16(input0, input1); 81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Columns 83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Construct i3, i1, i3, i1, i2, i0, i2, i0 84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_unpacklo_epi32(input2, input2); 85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_unpackhi_epi32(input2, input2); 86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_unpackhi_epi32(input3, input3); 87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_unpacklo_epi32(input3, input3); 88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage 1 90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_madd_epi16(input0, cst); 91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_madd_epi16(input1, cst); 92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_madd_epi16(input2, cst); 93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_madd_epi16(input3, cst); 94da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 95da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_add_epi32(input0, rounding); 96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_add_epi32(input1, rounding); 97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = 
_mm_add_epi32(input2, rounding); 98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_add_epi32(input3, rounding); 99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); 101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); 102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); 103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); 104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage 2 106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_packs_epi32(input0, input2); 107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_packs_epi32(input1, input3); 108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Transpose 110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_unpacklo_epi16(input0, input1); 111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_unpackhi_epi16(input0, input1); 112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input0 = _mm_unpacklo_epi32(input2, input3); 113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_unpackhi_epi32(input2, input3); 114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Switch column2, column 3, and then, we got: 116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // input2: column1, column 0; input3: column2, column 3. 
117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input1 = _mm_shuffle_epi32(input1, 0x4e); 118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_add_epi16(input0, input1); 119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_sub_epi16(input0, input1); 120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Final round and shift 122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_add_epi16(input2, eight); 123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_add_epi16(input3, eight); 124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input2 = _mm_srai_epi16(input2, 4); 126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input3 = _mm_srai_epi16(input3, 4); 127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Reconstruction and Store 129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); 131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); 132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = _mm_unpacklo_epi32(d0, 133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_cvtsi32_si128(*(const int *)(dest + stride))); 134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d2 = _mm_unpacklo_epi32( 135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); 136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian d0 = _mm_unpacklo_epi8(d0, zero); 137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d2 = _mm_unpacklo_epi8(d2, zero); 138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = _mm_add_epi16(d0, input2); 139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d2 = _mm_add_epi16(d2, input3); 140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = _mm_packus_epi16(d0, d2); 141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // store input0 142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *(int *)dest = _mm_cvtsi128_si32(d0); 143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // store input1 144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = _mm_srli_si128(d0, 4); 145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); 146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // store input2 147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = _mm_srli_si128(d0, 4); 148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); 149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // store input3 150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = _mm_srli_si128(d0, 4); 151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); 152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1552263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, 1562263fc984bdc858ee931d3e35c87c404de923950Johann int stride) { 
157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i dc_value; 158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128(); 159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int a; 160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = dct_const_round_shift(input[0] * cospi_16_64); 162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = dct_const_round_shift(a * cospi_16_64); 163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = ROUND_POWER_OF_TWO(a, 4); 164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dc_value = _mm_set1_epi16(a); 166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE4X4(dest + 0 * stride, dc_value); 168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE4X4(dest + 1 * stride, dc_value); 169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE4X4(dest + 2 * stride, dc_value); 170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE4X4(dest + 3 * stride, dc_value); 171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic INLINE void transpose_4x4(__m128i *res) { 174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); 176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); 178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); 179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid idct4_sse2(__m128i *in) { 182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); 183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i u[8], v[8]; 188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian transpose_4x4(in); 190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 1 191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(in[0], in[1]); 192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(in[0], in[1]); 193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 
195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_packs_epi32(v[0], v[1]); 209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_packs_epi32(v[3], v[2]); 210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 2 212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[0] = _mm_add_epi16(u[0], u[1]); 213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[1] = _mm_sub_epi16(u[0], u[1]); 
214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[1] = _mm_shuffle_epi32(in[1], 0x4E); 215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid iadst4_sse2(__m128i *in) { 218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); 219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); 220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); 221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); 222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); 223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i kZero = _mm_set1_epi16(0); 224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i u[8], v[8], in7; 226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian transpose_4x4(in); 228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in7 = _mm_srli_si128(in[1], 8); 229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in7 = _mm_add_epi16(in7, in[0]); 230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in7 = _mm_sub_epi16(in7, in[1]); 231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian u[0] = _mm_unpacklo_epi16(in[0], in[1]); 233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(in[0], in[1]); 234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(in7, kZero); 235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(in[0], kZero); 236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], v[1]); 245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[3], v[4]); 246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = v[2]; 247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_add_epi32(u[0], u[1]); 248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_slli_epi32(v[5], 2); 249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_add_epi32(u[3], v[5]); 250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_sub_epi32(u[5], u[4]); 
251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[0] = _mm_packs_epi32(u[0], u[1]); 263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[1] = _mm_packs_epi32(u[2], u[3]); 264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ 267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0, out1, out2, out3, out4, out5, out6, out7) \ 268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 
270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ 272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ 273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ 274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ 275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ 276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ 277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ 279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ 280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ 281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ 282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ 283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ 284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ 285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ 
286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ 288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ 289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ 290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ 291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ 292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ 293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ 294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ 295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ 298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0, out1, out2, out3) \ 299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ 301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ 302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ 303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ 304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 
305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ 306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ 307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ 308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ 309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ 311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ 312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ 313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ 314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ 317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ 321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ 322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
// Define Macro for multiplying elements by constants and adding them together.
//
// Runs four multiply-accumulate passes and narrows each back to 16 bits:
//   res0 <- (lo_0, hi_0) with cst0      res1 <- (lo_0, hi_0) with cst1
//   res2 <- (lo_1, hi_1) with cst2      res3 <- (lo_1, hi_1) with cst3
// Each pass multiplies adjacent 16-bit pairs and sums them into 32-bit lanes
// (_mm_madd_epi16), adds `rounding`, shifts right arithmetically by
// DCT_CONST_BITS, and packs the lo/hi halves with signed saturation.
//
// The invoking scope must provide `rounding` and the scratch registers
// tmp0..tmp7 (all __m128i); the scratch registers are clobbered. The macro
// expands to a braced block, so callers may invoke it without a trailing
// semicolon.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1,                    \
                               cst0, cst1, cst2, cst3,                    \
                               res0, res1, res2, res3)                    \
  {                                                                       \
    tmp0 = _mm_madd_epi16(lo_0, cst0);                                    \
    tmp1 = _mm_madd_epi16(hi_0, cst0);                                    \
    tmp2 = _mm_madd_epi16(lo_0, cst1);                                    \
    tmp3 = _mm_madd_epi16(hi_0, cst1);                                    \
    tmp4 = _mm_madd_epi16(lo_1, cst2);                                    \
    tmp5 = _mm_madd_epi16(hi_1, cst2);                                    \
    tmp6 = _mm_madd_epi16(lo_1, cst3);                                    \
    tmp7 = _mm_madd_epi16(hi_1, cst3);                                    \
                                                                          \
    tmp0 = _mm_add_epi32(tmp0, rounding);                                 \
    tmp1 = _mm_add_epi32(tmp1, rounding);                                 \
    tmp2 = _mm_add_epi32(tmp2, rounding);                                 \
    tmp3 = _mm_add_epi32(tmp3, rounding);                                 \
    tmp4 = _mm_add_epi32(tmp4, rounding);                                 \
    tmp5 = _mm_add_epi32(tmp5, rounding);                                 \
    tmp6 = _mm_add_epi32(tmp6, rounding);                                 \
    tmp7 = _mm_add_epi32(tmp7, rounding);                                 \
                                                                          \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                          \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                          \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                          \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                          \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);                          \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);                          \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);                          \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);                          \
                                                                          \
    res0 = _mm_packs_epi32(tmp0, tmp1);                                   \
    res1 = _mm_packs_epi32(tmp2, tmp3);                                   \
    res2 = _mm_packs_epi32(tmp4, tmp5);                                   \
    res3 = _mm_packs_epi32(tmp6, tmp7);                                   \
  }

Venkatasubramanian#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ 362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_0, cst0); \ 364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_madd_epi16(hi_0, cst0); \ 365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_0, cst1); \ 366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_madd_epi16(hi_0, cst1); \ 367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); \ 369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_add_epi32(tmp1, rounding); \ 370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); \ 371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_add_epi32(tmp3, rounding); \ 372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res0 = _mm_packs_epi32(tmp0, tmp1); \ 379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res1 = _mm_packs_epi32(tmp2, tmp3); \ 380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian } 381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ 383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0, out1, out2, out3, out4, out5, out6, out7) \ 384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage1 */ \ 386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ 388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ 389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ 390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ 391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ 393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg1_1, stg1_2, stg1_3, stp1_4, \ 394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_7, stp1_5, stp1_6) \ 395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage2 */ \ 398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ 400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_04 = 
_mm_unpackhi_epi16(in0, in4); \ 401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ 402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ 403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ 405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg2_1, stg2_2, stg2_3, stp2_0, \ 406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_1, stp2_2, stp2_3) \ 407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ 409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ 410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ 411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ 412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage3 */ \ 415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ 
420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ 421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ 422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ 423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ 425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ 426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ 427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ 428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); \ 430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_add_epi32(tmp1, rounding); \ 431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); \ 432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_add_epi32(tmp3, rounding); \ 433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 
439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage4 */ \ 444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0 = _mm_adds_epi16(stp1_0, stp2_7); \ 445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out1 = _mm_adds_epi16(stp1_1, stp1_6); \ 446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out2 = _mm_adds_epi16(stp1_2, stp1_5); \ 447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out3 = _mm_adds_epi16(stp1_3, stp2_4); \ 448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out4 = _mm_subs_epi16(stp1_3, stp2_4); \ 449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out5 = _mm_subs_epi16(stp1_2, stp1_5); \ 450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out6 = _mm_subs_epi16(stp1_1, stp1_6); \ 451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out7 = _mm_subs_epi16(stp1_0, stp2_7); \ 452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 4542263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, 4552263fc984bdc858ee931d3e35c87c404de923950Johann int stride) { 456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128(); 457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i 
final_rounding = _mm_set1_epi16(1 << 4); 459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i in0, in1, in2, in3, in4, in5, in6, in7; 469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int i; 473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Load input data. 
4752263fc984bdc858ee931d3e35c87c404de923950Johann in0 = load_input_data(input); 4762263fc984bdc858ee931d3e35c87c404de923950Johann in1 = load_input_data(input + 8 * 1); 4772263fc984bdc858ee931d3e35c87c404de923950Johann in2 = load_input_data(input + 8 * 2); 4782263fc984bdc858ee931d3e35c87c404de923950Johann in3 = load_input_data(input + 8 * 3); 4792263fc984bdc858ee931d3e35c87c404de923950Johann in4 = load_input_data(input + 8 * 4); 4802263fc984bdc858ee931d3e35c87c404de923950Johann in5 = load_input_data(input + 8 * 5); 4812263fc984bdc858ee931d3e35c87c404de923950Johann in6 = load_input_data(input + 8 * 6); 4822263fc984bdc858ee931d3e35c87c404de923950Johann in7 = load_input_data(input + 8 * 7); 483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 2-D 485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 2; i++) { 486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 8x8 Transpose is copied from vpx_fdct8x8_sse2() 487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, 488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in0, in1, in2, in3, in4, in5, in6, in7); 489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 4-stage 1D idct8x8 491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, 492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in0, in1, in2, in3, in4, in5, in6, in7); 493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Final rounding and shift 496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in0 = 
_mm_adds_epi16(in0, final_rounding); 497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in1 = _mm_adds_epi16(in1, final_rounding); 498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in2 = _mm_adds_epi16(in2, final_rounding); 499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in3 = _mm_adds_epi16(in3, final_rounding); 500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in4 = _mm_adds_epi16(in4, final_rounding); 501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in5 = _mm_adds_epi16(in5, final_rounding); 502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in6 = _mm_adds_epi16(in6, final_rounding); 503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in7 = _mm_adds_epi16(in7, final_rounding); 504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in0 = _mm_srai_epi16(in0, 5); 506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in1 = _mm_srai_epi16(in1, 5); 507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in2 = _mm_srai_epi16(in2, 5); 508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in3 = _mm_srai_epi16(in3, 5); 509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in4 = _mm_srai_epi16(in4, 5); 510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in5 = _mm_srai_epi16(in5, 5); 511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in6 = _mm_srai_epi16(in6, 5); 512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in7 = _mm_srai_epi16(in7, 5); 513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 0 * stride, in0); 515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 1 * stride, in1); 
516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 2 * stride, in2); 517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 3 * stride, in3); 518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 4 * stride, in4); 519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 5 * stride, in5); 520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 6 * stride, in6); 521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 7 * stride, in7); 522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 5242263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, 5252263fc984bdc858ee931d3e35c87c404de923950Johann int stride) { 526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i dc_value; 527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128(); 528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int a; 529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = dct_const_round_shift(input[0] * cospi_16_64); 531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = dct_const_round_shift(a * cospi_16_64); 532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = ROUND_POWER_OF_TWO(a, 5); 533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dc_value = _mm_set1_epi16(a); 535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
RECON_AND_STORE(dest + 0 * stride, dc_value); 537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 1 * stride, dc_value); 538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 2 * stride, dc_value); 539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 3 * stride, dc_value); 540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 4 * stride, dc_value); 541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 5 * stride, dc_value); 542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 6 * stride, dc_value); 543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 7 * stride, dc_value); 544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid idct8_sse2(__m128i *in) { 547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_1 = 
pair_set_epi16(cospi_16_64, -cospi_16_64); 554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i in0, in1, in2, in3, in4, in5, in6, in7; 558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 8x8 Transpose is copied from vpx_fdct8x8_sse2() 563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], 564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in0, in1, in2, in3, in4, in5, in6, in7); 565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 4-stage 1D idct8x8 567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, 568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); 569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid iadst8_sse2(__m128i *in) { 
572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); 585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__const_0 = _mm_set1_epi16(0); 
586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; 589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; 591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i s0, s1, s2, s3, s4, s5, s6, s7; 592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i in0, in1, in2, in3, in4, in5, in6, in7; 593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // transpose 595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(in, in); 596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // properly aligned for butterfly input 598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in0 = in[7]; 599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in1 = in[0]; 600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in2 = in[5]; 601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in3 = in[2]; 602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in4 = in[3]; 603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in5 = in[4]; 604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in6 = in[1]; 605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in7 = in[6]; 
606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // column transformation 608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 1 609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // interleave and multiply/add into 32-bit integer 610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s0 = _mm_unpacklo_epi16(in0, in1); 611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s1 = _mm_unpackhi_epi16(in0, in1); 612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s2 = _mm_unpacklo_epi16(in2, in3); 613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s3 = _mm_unpackhi_epi16(in2, in3); 614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s4 = _mm_unpacklo_epi16(in4, in5); 615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s5 = _mm_unpackhi_epi16(in4, in5); 616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s6 = _mm_unpacklo_epi16(in6, in7); 617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s7 = _mm_unpackhi_epi16(in6, in7); 618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); 620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); 621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); 622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); 623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); 624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); 
625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); 626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); 627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); 628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); 629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); 630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); 631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); 632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); 633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); 634da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); 635da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // addition 637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w0 = _mm_add_epi32(u0, u8); 638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w1 = _mm_add_epi32(u1, u9); 639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w2 = _mm_add_epi32(u2, u10); 640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w3 = _mm_add_epi32(u3, u11); 641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w4 = _mm_add_epi32(u4, u12); 642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w5 = _mm_add_epi32(u5, u13); 643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w6 = _mm_add_epi32(u6, u14); 
644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w7 = _mm_add_epi32(u7, u15); 645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w8 = _mm_sub_epi32(u0, u8); 646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w9 = _mm_sub_epi32(u1, u9); 647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w10 = _mm_sub_epi32(u2, u10); 648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w11 = _mm_sub_epi32(u3, u11); 649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w12 = _mm_sub_epi32(u4, u12); 650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w13 = _mm_sub_epi32(u5, u13); 651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w14 = _mm_sub_epi32(u6, u14); 652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w15 = _mm_sub_epi32(u7, u15); 653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // shift and rounding 655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 
663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); 664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); 665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); 666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); 667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); 668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); 669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); 670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); 671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u8 = _mm_srai_epi32(v8, 
DCT_CONST_BITS); 681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); 682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); 683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); 684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); 685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); 686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); 687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); 688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // back to 16-bit and pack 8 integers into __m128i 690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[0] = _mm_packs_epi32(u0, u1); 691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[1] = _mm_packs_epi32(u2, u3); 692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[2] = _mm_packs_epi32(u4, u5); 693da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[3] = _mm_packs_epi32(u6, u7); 694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[4] = _mm_packs_epi32(u8, u9); 695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[5] = _mm_packs_epi32(u10, u11); 696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[6] = _mm_packs_epi32(u12, u13); 697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[7] = _mm_packs_epi32(u14, u15); 698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 2 
700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s0 = _mm_add_epi16(in[0], in[2]); 701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s1 = _mm_add_epi16(in[1], in[3]); 702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s2 = _mm_sub_epi16(in[0], in[2]); 703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s3 = _mm_sub_epi16(in[1], in[3]); 704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u0 = _mm_unpacklo_epi16(in[4], in[5]); 705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u1 = _mm_unpackhi_epi16(in[4], in[5]); 706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u2 = _mm_unpacklo_epi16(in[6], in[7]); 707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u3 = _mm_unpackhi_epi16(in[6], in[7]); 708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); 710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); 711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); 712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); 713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); 714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); 715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); 716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); 717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w0 = _mm_add_epi32(v0, v4); 
719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w1 = _mm_add_epi32(v1, v5); 720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w2 = _mm_add_epi32(v2, v6); 721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w3 = _mm_add_epi32(v3, v7); 722da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w4 = _mm_sub_epi32(v0, v4); 723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w5 = _mm_sub_epi32(v1, v5); 724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w6 = _mm_sub_epi32(v2, v6); 725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian w7 = _mm_sub_epi32(v3, v7); 726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 
738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 742da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 743da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 744da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // back to 16-bit intergers 746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s4 = _mm_packs_epi32(u0, u1); 747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s5 = _mm_packs_epi32(u2, u3); 748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s6 = _mm_packs_epi32(u4, u5); 749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s7 = _mm_packs_epi32(u6, u7); 750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 3 752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u0 = _mm_unpacklo_epi16(s2, s3); 753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u1 = _mm_unpackhi_epi16(s2, s3); 754da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u2 = _mm_unpacklo_epi16(s6, s7); 755da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u3 = _mm_unpackhi_epi16(s6, s7); 756da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 757da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); 
758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); 759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); 760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); 761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); 762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); 763da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); 764da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); 765da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 774da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 775da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 
776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 777da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 778da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 779da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 780da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 782da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 783da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s2 = _mm_packs_epi32(v0, v1); 785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s3 = _mm_packs_epi32(v2, v3); 786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s6 = _mm_packs_epi32(v4, v5); 787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s7 = _mm_packs_epi32(v6, v7); 788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[0] = s0; 790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[1] = _mm_sub_epi16(k__const_0, s4); 791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[2] = s6; 792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[3] = _mm_sub_epi16(k__const_0, s2); 793da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[4] = s3; 794da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[5] = _mm_sub_epi16(k__const_0, s7); 795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[6] = s5; 
in[7] = _mm_sub_epi16(k__const_0, s1);
}

// Inverse 8x8 transform variant that reads only the first four rows of
// coefficients, then hands the result to RECON_AND_STORE for eight rows of
// dest.
//
// input:  coefficient buffer; rows are read at input, input + 8, input + 16
//         and input + 24 (eight coefficients per row). Later rows are never
//         loaded.
// dest:   destination pointer passed to RECON_AND_STORE for rows 0..7.
// stride: distance, in uint8_t elements, between consecutive rows of dest.
//
// NOTE(review): the "_12" suffix presumably means the caller guarantees at
// most 12 non-zero coefficients, all in the upper-left corner -- confirm
// against the dispatch code.
void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  // Added before every ">> DCT_CONST_BITS" so the shift rounds to nearest.
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Final output is (x + 16) >> 5, i.e. a rounded divide by 32.
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  // Cosine constant pairs consumed by _mm_madd_epi16 in the stages below.
  // NOTE(review): stg1_* and stg2_* are presumably also referenced by name
  // inside the IDCT8 macro (defined elsewhere) -- do not rename.
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  // NOTE(review): stp1_0, stp1_1, stp1_6, stp1_7, stp2_1, stp2_3, stp2_7,
  // tmp5 and tmp7 are never referenced directly in this function; presumably
  // the IDCT8 macro uses them as scratch by name -- confirm before removing.
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);

  // 8x4 Transpose
  // Four registers in, two out: the result overwrites in0 and in1.
  // NOTE(review): the lo_/hi_ names below suggest in0 then holds lines 0|1
  // and in1 lines 2|3 in its low|high halves -- confirm against the macro.
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
  // Stage1
  {
    // The upper four 16-bit lanes are interleaved with zeros, so in every
    // _mm_madd_epi16 pair one of the two products is zero.
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    // Round and scale the 32-bit products back down by DCT_CONST_BITS.
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    // Narrow to 16 bits with signed saturation, two results per register.
    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  {
    // Same zero-interleave trick, this time on the lower four lanes.
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    // Operand order is (tmp6, tmp4) here, the reverse of the pack above.
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    // Saturating butterfly on the two packed Stage1 registers.
    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    // Split the difference into its low and high 64-bit halves, each padded
    // with zeros in the upper half of the register.
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    // Saturating butterfly on the two packed Stage2 registers.
    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);

    // Regroup 64-bit halves: high halves of (tmp6, tmp4) into stp1_2, low
    // halves into stp1_3.
    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  // Final saturating butterflies of the row pass.
  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);

  // No trailing semicolon in the original; the macro presumably expands to
  // complete statements -- confirm against its definition.
  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  // Second pass: the last four inputs are passed as zero vectors, and the
  // eight outputs land in in0..in7.
  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
        in0, in1, in2, in3, in4, in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  // One call per output row of dest.
  // NOTE(review): RECON_AND_STORE is defined elsewhere; by analogy with
  // RECON_AND_STORE4X4 at the top of this file it presumably adds each row
  // to the existing dest pixels with unsigned saturation and relies on the
  // local `zero` -- confirm.
  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}

#define IDCT16 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
951da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 952da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ 953da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg2_0, stg2_1, stg2_2, stg2_3, \ 954da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_8, stp2_15, stp2_9, stp2_14) \ 955da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 956da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ 957da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg2_4, stg2_5, stg2_6, stg2_7, \ 958da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_10, stp2_13, stp2_11, stp2_12) \ 959da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 960da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 961da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage3 */ \ 962da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 963da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ 964da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ 965da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ 966da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ 967da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 968da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ 969da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg3_0, stg3_1, stg3_2, stg3_3, \ 
970da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_4, stp1_7, stp1_5, stp1_6) \ 971da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 972da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ 973da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ 974da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ 975da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ 976da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 977da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ 978da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ 979da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ 980da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ 981da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 982da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 983da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage4 */ \ 984da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 985da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ 986da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ 987da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ 988da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ 
989da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 990da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 991da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 992da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 993da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 994da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 995da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ 996da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_0, stg4_1, stg4_2, stg4_3, \ 997da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_0, stp2_1, stp2_2, stp2_3) \ 998da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 999da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ 1000da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ 1001da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ 1002da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ 1003da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1004da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ 1005da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_4, stg4_5, stg4_6, stg4_7, \ 1006da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_9, stp2_14, stp2_10, stp2_13) \ 
1007da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 1008da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1009da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage5 */ \ 1010da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 1011da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 1012da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 1013da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1014da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ 1015da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ 1016da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ 1017da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ 1018da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1019da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 1020da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 1021da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 1022da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 1023da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1024da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); \ 1025da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_add_epi32(tmp1, rounding); \ 1026da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = 
_mm_add_epi32(tmp2, rounding); \ 1027da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_add_epi32(tmp3, rounding); \ 1028da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1029da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 1030da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 1031da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 1032da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 1033da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1034da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 1035da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 1036da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1037da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ 1038da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 1039da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 1040da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ 1041da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1042da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ 1043da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 1044da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 1045da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ 1046da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 1047da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1048da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage6 */ \ 1049da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 1050da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1051da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1052da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 1053da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 1054da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1055da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ 1056da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 1057da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 1058da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ 1059da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ 1060da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 1061da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 1062da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ 1063da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 
1064da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 1065da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg6_0, stg4_0, stg6_0, stg4_0, \ 1066da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_10, stp2_13, stp2_11, stp2_12) \ 1067da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1068da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1069da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define IDCT16_10 \ 1070da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage2 */ \ 1071da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 1072da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ 1073da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ 1074da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ 1075da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ 1076da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1077da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ 1078da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg2_0, stg2_1, stg2_6, stg2_7, \ 1079da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ 1080da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 1081da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1082da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage3 */ \ 1083da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 
1084da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ 1085da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ 1086da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1087da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ 1088da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg3_0, stg3_1, \ 1089da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_4, stp2_7) \ 1090da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1091da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_9 = stp1_8_0; \ 1092da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_10 = stp1_11; \ 1093da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1094da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_13 = stp1_12_0; \ 1095da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_14 = stp1_15; \ 1096da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 1097da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1098da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage4 */ \ 1099da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 1100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ 1101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ 1102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 1104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const 
__m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 1105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ 1109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_0, stg4_1, \ 1110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_0, stp1_1) \ 1111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_5 = stp2_4; \ 1112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_6 = stp2_7; \ 1113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ 1115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_4, stg4_5, stg4_6, stg4_7, \ 1116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_9, stp2_14, stp2_10, stp2_13) \ 1117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 1118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage5 */ \ 1120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 1121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 1122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 1123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian stp1_2 = stp1_1; \ 1125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_3 = stp1_0; \ 1126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 1128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 1129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 1130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 1131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); \ 1133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_add_epi32(tmp1, rounding); \ 1134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); \ 1135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_add_epi32(tmp3, rounding); \ 1136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 1138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 1139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 1140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 1141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 1143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_6 = _mm_packs_epi32(tmp2, 
tmp3); \ 1144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ 1146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 1147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 1148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ 1149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ 1151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 1152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 1153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ 1154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 1155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Stage6 */ \ 1157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 1158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 1161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 1162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian \ 1163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ 1164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 1165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 1166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ 1167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ 1168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 1169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 1170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ 1171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 1172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 1173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg6_0, stg4_0, stg6_0, stg4_0, \ 1174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_10, stp2_13, stp2_11, stp2_12) \ 1175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11772263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, 1178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int stride) { 1179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 1180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i final_rounding = _mm_set1_epi16(1 << 5); 
1181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128(); 1182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 1185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 1187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 1189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 1190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 1191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 1194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 1196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
1197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 1198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 1201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 1203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i in[16], l[16], r[16], *curr1; 1209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 1210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 1211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8_0, stp1_12_0; 1212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 
1213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; 1214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int i; 1216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1 = l; 1218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 2; i++) { 1219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 1-D idct 1220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Load input data. 12222263fc984bdc858ee931d3e35c87c404de923950Johann in[0] = load_input_data(input); 12232263fc984bdc858ee931d3e35c87c404de923950Johann in[8] = load_input_data(input + 8 * 1); 12242263fc984bdc858ee931d3e35c87c404de923950Johann in[1] = load_input_data(input + 8 * 2); 12252263fc984bdc858ee931d3e35c87c404de923950Johann in[9] = load_input_data(input + 8 * 3); 12262263fc984bdc858ee931d3e35c87c404de923950Johann in[2] = load_input_data(input + 8 * 4); 12272263fc984bdc858ee931d3e35c87c404de923950Johann in[10] = load_input_data(input + 8 * 5); 12282263fc984bdc858ee931d3e35c87c404de923950Johann in[3] = load_input_data(input + 8 * 6); 12292263fc984bdc858ee931d3e35c87c404de923950Johann in[11] = load_input_data(input + 8 * 7); 12302263fc984bdc858ee931d3e35c87c404de923950Johann in[4] = load_input_data(input + 8 * 8); 12312263fc984bdc858ee931d3e35c87c404de923950Johann in[12] = load_input_data(input + 8 * 9); 12322263fc984bdc858ee931d3e35c87c404de923950Johann in[5] = load_input_data(input + 8 * 10); 12332263fc984bdc858ee931d3e35c87c404de923950Johann in[13] = load_input_data(input + 8 * 11); 12342263fc984bdc858ee931d3e35c87c404de923950Johann in[6] = 
load_input_data(input + 8 * 12); 12352263fc984bdc858ee931d3e35c87c404de923950Johann in[14] = load_input_data(input + 8 * 13); 12362263fc984bdc858ee931d3e35c87c404de923950Johann in[7] = load_input_data(input + 8 * 14); 12372263fc984bdc858ee931d3e35c87c404de923950Johann in[15] = load_input_data(input + 8 * 15); 1238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(in, in); 1240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(in + 8, in + 8); 1241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian IDCT16 1243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage7 1245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[0] = _mm_add_epi16(stp2_0, stp1_15); 1246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[1] = _mm_add_epi16(stp2_1, stp1_14); 1247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[2] = _mm_add_epi16(stp2_2, stp2_13); 1248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[3] = _mm_add_epi16(stp2_3, stp2_12); 1249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[4] = _mm_add_epi16(stp2_4, stp2_11); 1250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[5] = _mm_add_epi16(stp2_5, stp2_10); 1251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[6] = _mm_add_epi16(stp2_6, stp1_9); 1252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[7] = _mm_add_epi16(stp2_7, stp1_8); 1253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); 1254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[9] = 
_mm_sub_epi16(stp2_6, stp1_9); 1255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); 1256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); 1257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); 1258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); 1259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); 1260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); 1261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian curr1 = r; 1263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input += 128; 1264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 2; i++) { 1266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int j; 1267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 1-D idct 1268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(l + i * 8, in); 1269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(r + i * 8, in + 8); 1270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian IDCT16 1272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 2-D 1274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[0] = _mm_add_epi16(stp2_0, stp1_15); 1275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[1] = 
_mm_add_epi16(stp2_1, stp1_14); 1276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[2] = _mm_add_epi16(stp2_2, stp2_13); 1277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[3] = _mm_add_epi16(stp2_3, stp2_12); 1278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[4] = _mm_add_epi16(stp2_4, stp2_11); 1279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[5] = _mm_add_epi16(stp2_5, stp2_10); 1280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[6] = _mm_add_epi16(stp2_6, stp1_9); 1281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[7] = _mm_add_epi16(stp2_7, stp1_8); 1282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[8] = _mm_sub_epi16(stp2_7, stp1_8); 1283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[9] = _mm_sub_epi16(stp2_6, stp1_9); 1284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[10] = _mm_sub_epi16(stp2_5, stp2_10); 1285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[11] = _mm_sub_epi16(stp2_4, stp2_11); 1286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[12] = _mm_sub_epi16(stp2_3, stp2_12); 1287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[13] = _mm_sub_epi16(stp2_2, stp2_13); 1288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[14] = _mm_sub_epi16(stp2_1, stp1_14); 1289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[15] = _mm_sub_epi16(stp2_0, stp1_15); 1290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 16; ++j) { 1292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Final rounding and shift 1293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[j] = _mm_adds_epi16(in[j], final_rounding); 
1294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[j] = _mm_srai_epi16(in[j], 6); 1295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + j * stride, in[j]); 1296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest += 8; 1299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 13022263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, 13032263fc984bdc858ee931d3e35c87c404de923950Johann int stride) { 1304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i dc_value; 1305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128(); 1306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int a, i; 1307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = dct_const_round_shift(input[0] * cospi_16_64); 1309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = dct_const_round_shift(a * cospi_16_64); 1310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = ROUND_POWER_OF_TWO(a, 6); 1311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dc_value = _mm_set1_epi16(a); 1313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 2; ++i) { 1315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 0 * stride, dc_value); 
1316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 1 * stride, dc_value); 1317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 2 * stride, dc_value); 1318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 3 * stride, dc_value); 1319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 4 * stride, dc_value); 1320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 5 * stride, dc_value); 1321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 6 * stride, dc_value); 1322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 7 * stride, dc_value); 1323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 8 * stride, dc_value); 1324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 9 * stride, dc_value); 1325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 10 * stride, dc_value); 1326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 11 * stride, dc_value); 1327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 12 * stride, dc_value); 1328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 13 * stride, dc_value); 1329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 14 * stride, dc_value); 1330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + 15 * stride, dc_value); 1331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest += 8; 1332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 
1334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void iadst16_8col(__m128i *in) { 1336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // perform 16x16 1-D ADST for 8 columns 1337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i s[16], x[16], u[32], v[32]; 1338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 1339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 1340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 1341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 1342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); 1343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); 1344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); 1345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); 1346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); 1347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); 1348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, 
cospi_11_64); 1349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); 1350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); 1351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); 1352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); 1353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); 1354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 1355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 1357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); 1359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); 1360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 1361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i 
k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 1363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); 1364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); 1365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i kZero = _mm_set1_epi16(0); 1369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(in[15], in[0]); 1371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(in[15], in[0]); 1372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(in[13], in[2]); 1373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(in[13], in[2]); 1374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(in[11], in[4]); 1375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_unpackhi_epi16(in[11], in[4]); 1376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_unpacklo_epi16(in[9], in[6]); 1377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_unpackhi_epi16(in[9], in[6]); 1378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[8] = _mm_unpacklo_epi16(in[7], in[8]); 
1379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_unpackhi_epi16(in[7], in[8]); 1380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_unpacklo_epi16(in[5], in[10]); 1381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_unpackhi_epi16(in[5], in[10]); 1382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_unpacklo_epi16(in[3], in[12]); 1383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = _mm_unpackhi_epi16(in[3], in[12]); 1384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[14] = _mm_unpacklo_epi16(in[1], in[14]); 1385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_unpackhi_epi16(in[1], in[14]); 1386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); 1388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); 1389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); 1390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); 1391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); 1392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); 1393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); 1394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); 1395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); 1396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); 1397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); 1398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); 1399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); 1400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); 1401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); 1402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); 1403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); 1404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); 1405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); 1406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); 1407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); 1408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); 1409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); 1410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); 1411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); 1412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); 
1413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); 1414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); 1415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); 1416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); 1417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); 1418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); 1419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], v[16]); 1421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], v[17]); 1422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], v[18]); 1423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], v[19]); 1424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], v[20]); 1425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], v[21]); 1426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], v[22]); 1427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], v[23]); 1428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[8] = _mm_add_epi32(v[8], v[24]); 1429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_add_epi32(v[9], v[25]); 1430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_add_epi32(v[10], v[26]); 
1431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_add_epi32(v[11], v[27]); 1432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_add_epi32(v[12], v[28]); 1433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = _mm_add_epi32(v[13], v[29]); 1434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[14] = _mm_add_epi32(v[14], v[30]); 1435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_add_epi32(v[15], v[31]); 1436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[16] = _mm_sub_epi32(v[0], v[16]); 1437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[17] = _mm_sub_epi32(v[1], v[17]); 1438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[18] = _mm_sub_epi32(v[2], v[18]); 1439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[19] = _mm_sub_epi32(v[3], v[19]); 1440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[20] = _mm_sub_epi32(v[4], v[20]); 1441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[21] = _mm_sub_epi32(v[5], v[21]); 1442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[22] = _mm_sub_epi32(v[6], v[22]); 1443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[23] = _mm_sub_epi32(v[7], v[23]); 1444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[24] = _mm_sub_epi32(v[8], v[24]); 1445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[25] = _mm_sub_epi32(v[9], v[25]); 1446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[26] = _mm_sub_epi32(v[10], v[26]); 1447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[27] = _mm_sub_epi32(v[11], v[27]); 1448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[28] = _mm_sub_epi32(v[12], v[28]); 1449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian u[29] = _mm_sub_epi32(v[13], v[29]); 1450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[30] = _mm_sub_epi32(v[14], v[30]); 1451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[31] = _mm_sub_epi32(v[15], v[31]); 1452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 1470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 1471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); 1472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 1473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 1474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 1475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 1476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 1477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 1478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 1479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 1480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 1481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 1482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 1483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 1484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); 1485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 1495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 1496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = 
_mm_srai_epi32(v[13], DCT_CONST_BITS); 1500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); 1503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); 1504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); 1505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); 1506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); 1507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); 1508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); 1509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); 1510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); 1511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); 1512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); 1513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); 1514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); 1515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); 1516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); 1517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); 1518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[0] = _mm_packs_epi32(u[0], u[1]); 1520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[1] = _mm_packs_epi32(u[2], u[3]); 1521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[2] = _mm_packs_epi32(u[4], u[5]); 1522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[3] = _mm_packs_epi32(u[6], u[7]); 1523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[4] = _mm_packs_epi32(u[8], u[9]); 1524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[5] = _mm_packs_epi32(u[10], u[11]); 1525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[6] = _mm_packs_epi32(u[12], u[13]); 1526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[7] = _mm_packs_epi32(u[14], u[15]); 1527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[8] = _mm_packs_epi32(u[16], u[17]); 1528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[9] = _mm_packs_epi32(u[18], u[19]); 1529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[10] = _mm_packs_epi32(u[20], u[21]); 1530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[11] = _mm_packs_epi32(u[22], u[23]); 1531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[12] = _mm_packs_epi32(u[24], u[25]); 1532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[13] = _mm_packs_epi32(u[26], u[27]); 1533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[14] = _mm_packs_epi32(u[28], u[29]); 1534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[15] = _mm_packs_epi32(u[30], u[31]); 
1535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 2 1537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(s[8], s[9]); 1538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(s[8], s[9]); 1539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(s[10], s[11]); 1540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(s[10], s[11]); 1541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(s[12], s[13]); 1542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_unpackhi_epi16(s[12], s[13]); 1543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_unpacklo_epi16(s[14], s[15]); 1544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_unpackhi_epi16(s[14], s[15]); 1545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 1547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 1548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 1549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 1550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 1551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 1552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 1553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 1554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); 1555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); 1556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); 1557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); 1558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); 1559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); 1560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); 1561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); 1562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], v[8]); 1564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], v[9]); 1565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], v[10]); 1566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], v[11]); 1567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], v[12]); 1568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], v[13]); 1569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], v[14]); 1570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], v[15]); 1571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian u[8] = _mm_sub_epi32(v[0], v[8]); 1572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_sub_epi32(v[1], v[9]); 1573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_sub_epi32(v[2], v[10]); 1574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_sub_epi32(v[3], v[11]); 1575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_sub_epi32(v[4], v[12]); 1576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = _mm_sub_epi32(v[5], v[13]); 1577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[14] = _mm_sub_epi32(v[6], v[14]); 1578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_sub_epi32(v[7], v[15]); 1579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 
1589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 
1606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 1607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[0] = _mm_add_epi16(s[0], s[4]); 1615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[1] = _mm_add_epi16(s[1], s[5]); 1616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[2] = _mm_add_epi16(s[2], s[6]); 1617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[3] = _mm_add_epi16(s[3], s[7]); 1618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[4] = _mm_sub_epi16(s[0], s[4]); 1619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[5] = _mm_sub_epi16(s[1], s[5]); 1620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[6] = _mm_sub_epi16(s[2], s[6]); 1621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[7] = _mm_sub_epi16(s[3], s[7]); 1622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[8] = _mm_packs_epi32(u[0], u[1]); 1623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[9] = _mm_packs_epi32(u[2], u[3]); 
1624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[10] = _mm_packs_epi32(u[4], u[5]); 1625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[11] = _mm_packs_epi32(u[6], u[7]); 1626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[12] = _mm_packs_epi32(u[8], u[9]); 1627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[13] = _mm_packs_epi32(u[10], u[11]); 1628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[14] = _mm_packs_epi32(u[12], u[13]); 1629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian x[15] = _mm_packs_epi32(u[14], u[15]); 1630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 3 1632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(x[4], x[5]); 1633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(x[4], x[5]); 1634da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(x[6], x[7]); 1635da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(x[6], x[7]); 1636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(x[12], x[13]); 1637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_unpackhi_epi16(x[12], x[13]); 1638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_unpacklo_epi16(x[14], x[15]); 1639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_unpackhi_epi16(x[14], x[15]); 1640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); 1642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], 
k__cospi_p08_p24); 1643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); 1644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 1645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); 1646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); 1647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 1648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 1649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); 1650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); 1651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); 1652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); 1653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); 1654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); 1655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); 1656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); 1657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], v[4]); 1659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], v[5]); 
1660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], v[6]); 1661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], v[7]); 1662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_sub_epi32(v[0], v[4]); 1663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_sub_epi32(v[1], v[5]); 1664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_sub_epi32(v[2], v[6]); 1665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_sub_epi32(v[3], v[7]); 1666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[8] = _mm_add_epi32(v[8], v[12]); 1667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_add_epi32(v[9], v[13]); 1668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_add_epi32(v[10], v[14]); 1669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_add_epi32(v[11], v[15]); 1670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_sub_epi32(v[8], v[12]); 1671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = _mm_sub_epi32(v[9], v[13]); 1672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[14] = _mm_sub_epi32(v[10], v[14]); 1673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_sub_epi32(v[11], v[15]); 1674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1693da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 1701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 1702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 1703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 1704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 1705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 1706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 1707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 1708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[0] = _mm_add_epi16(x[0], x[2]); 1710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[1] = _mm_add_epi16(x[1], x[3]); 1711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[2] = _mm_sub_epi16(x[0], x[2]); 1712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[3] = _mm_sub_epi16(x[1], x[3]); 
1713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[4] = _mm_packs_epi32(v[0], v[1]); 1714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[5] = _mm_packs_epi32(v[2], v[3]); 1715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[6] = _mm_packs_epi32(v[4], v[5]); 1716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[7] = _mm_packs_epi32(v[6], v[7]); 1717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[8] = _mm_add_epi16(x[8], x[10]); 1718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[9] = _mm_add_epi16(x[9], x[11]); 1719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[10] = _mm_sub_epi16(x[8], x[10]); 1720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[11] = _mm_sub_epi16(x[9], x[11]); 1721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[12] = _mm_packs_epi32(v[8], v[9]); 1722da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[13] = _mm_packs_epi32(v[10], v[11]); 1723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[14] = _mm_packs_epi32(v[12], v[13]); 1724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[15] = _mm_packs_epi32(v[14], v[15]); 1725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 4 1727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(s[2], s[3]); 1728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(s[2], s[3]); 1729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(s[6], s[7]); 1730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(s[6], s[7]); 1731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(s[10], s[11]); 
1732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_unpackhi_epi16(s[10], s[11]); 1733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_unpacklo_epi16(s[14], s[15]); 1734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_unpackhi_epi16(s[14], s[15]); 1735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); 1737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); 1738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 1739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 1740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 1741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 1742da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 1743da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 1744da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); 1745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); 1746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); 1747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); 1748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); 1749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); 1750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); 1751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); 1752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1754da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1755da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1756da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1757da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 1762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 1763da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 1764da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 1765da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 
1766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 1767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 1768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 1769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1774da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1775da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1777da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1778da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 1779da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 1780da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 1781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 1782da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 
1783da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 1784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 1785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 1786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[0] = s[0]; 1788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[1] = _mm_sub_epi16(kZero, s[8]); 1789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[2] = s[12]; 1790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[3] = _mm_sub_epi16(kZero, s[4]); 1791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[4] = _mm_packs_epi32(v[4], v[5]); 1792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[5] = _mm_packs_epi32(v[12], v[13]); 1793da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[6] = _mm_packs_epi32(v[8], v[9]); 1794da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[7] = _mm_packs_epi32(v[0], v[1]); 1795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[8] = _mm_packs_epi32(v[2], v[3]); 1796da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[9] = _mm_packs_epi32(v[10], v[11]); 1797da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[10] = _mm_packs_epi32(v[14], v[15]); 1798da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[11] = _mm_packs_epi32(v[6], v[7]); 1799da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[12] = s[5]; 1800da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[13] = _mm_sub_epi16(kZero, s[13]); 1801da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[14] = s[9]; 
1802da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[15] = _mm_sub_epi16(kZero, s[1]); 1803da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1804da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1805da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct16_8col(__m128i *in) { 1806da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1807da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 1808da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1809da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 1810da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1811da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 1812da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 1813da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 1814da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1815da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 1816da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 
1817da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 1818da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); 1819da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1820da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1821da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 1822da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1823da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 1824da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1825da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1826da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1827da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i v[16], u[16], s[16], t[16]; 1828da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1829da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 1 1830da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[0] = in[0]; 1831da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[1] = in[8]; 1832da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[2] = in[4]; 
1833da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[3] = in[12]; 1834da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[4] = in[2]; 1835da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[5] = in[10]; 1836da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[6] = in[6]; 1837da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[7] = in[14]; 1838da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[8] = in[1]; 1839da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[9] = in[9]; 1840da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[10] = in[5]; 1841da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[11] = in[13]; 1842da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[12] = in[3]; 1843da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[13] = in[11]; 1844da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[14] = in[7]; 1845da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[15] = in[15]; 1846da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1847da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 2 1848da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(s[8], s[15]); 1849da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(s[8], s[15]); 1850da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(s[9], s[14]); 1851da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(s[9], s[14]); 1852da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(s[10], s[13]); 1853da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_unpackhi_epi16(s[10], s[13]); 1854da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian u[6] = _mm_unpacklo_epi16(s[11], s[12]); 1855da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_unpackhi_epi16(s[11], s[12]); 1856da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1857da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); 1858da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); 1859da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); 1860da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); 1861da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); 1862da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); 1863da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); 1864da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); 1865da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); 1866da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); 1867da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); 1868da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); 1869da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); 1870da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); 1871da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[14] = _mm_madd_epi16(u[6], 
k__cospi_p26_p06); 1872da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); 1873da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1874da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1875da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1876da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1877da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1878da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1879da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1880da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1881da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1882da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 1883da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 1884da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 1885da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 1886da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 1887da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 1888da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 1889da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 1890da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1891da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1892da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1893da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1894da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1895da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1896da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1897da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1898da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1899da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 1900da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 1901da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 1902da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 1903da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 1904da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 1905da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 
1906da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 1907da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1908da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[8] = _mm_packs_epi32(u[0], u[1]); 1909da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[15] = _mm_packs_epi32(u[2], u[3]); 1910da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[9] = _mm_packs_epi32(u[4], u[5]); 1911da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[14] = _mm_packs_epi32(u[6], u[7]); 1912da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[10] = _mm_packs_epi32(u[8], u[9]); 1913da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[13] = _mm_packs_epi32(u[10], u[11]); 1914da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[11] = _mm_packs_epi32(u[12], u[13]); 1915da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[12] = _mm_packs_epi32(u[14], u[15]); 1916da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1917da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 3 1918da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[0] = s[0]; 1919da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[1] = s[1]; 1920da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[2] = s[2]; 1921da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[3] = s[3]; 1922da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(s[4], s[7]); 1923da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(s[4], s[7]); 1924da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(s[5], s[6]); 1925da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(s[5], s[6]); 
1926da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1927da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 1928da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 1929da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 1930da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 1931da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 1932da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 1933da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 1934da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 1935da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1936da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1937da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1938da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1939da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1940da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1941da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1942da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1943da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1944da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1945da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1946da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1947da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1948da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1949da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1950da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1951da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1952da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1953da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1954da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[4] = _mm_packs_epi32(u[0], u[1]); 1955da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[7] = _mm_packs_epi32(u[2], u[3]); 1956da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[5] = _mm_packs_epi32(u[4], u[5]); 1957da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[6] = _mm_packs_epi32(u[6], u[7]); 1958da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[8] = _mm_add_epi16(s[8], s[9]); 1959da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[9] = _mm_sub_epi16(s[8], s[9]); 1960da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[10] = _mm_sub_epi16(s[11], s[10]); 1961da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[11] = _mm_add_epi16(s[10], s[11]); 
1962da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[12] = _mm_add_epi16(s[12], s[13]); 1963da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[13] = _mm_sub_epi16(s[12], s[13]); 1964da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[14] = _mm_sub_epi16(s[15], s[14]); 1965da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[15] = _mm_add_epi16(s[14], s[15]); 1966da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1967da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 4 1968da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(t[0], t[1]); 1969da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(t[0], t[1]); 1970da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(t[2], t[3]); 1971da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(t[2], t[3]); 1972da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(t[9], t[14]); 1973da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_unpackhi_epi16(t[9], t[14]); 1974da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_unpacklo_epi16(t[10], t[13]); 1975da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_unpackhi_epi16(t[10], t[13]); 1976da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1977da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 1978da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 1979da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 1980da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], 
k__cospi_p16_m16); 1981da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); 1982da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); 1983da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 1984da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 1985da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); 1986da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); 1987da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); 1988da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); 1989da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); 1990da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); 1991da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); 1992da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); 1993da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1994da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1995da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1996da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1997da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 
1998da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1999da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2000da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2001da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2002da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 2003da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 2004da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 2005da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 2006da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 2007da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 2008da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 2009da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 2010da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2011da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2012da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2013da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2014da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(u[3], 
DCT_CONST_BITS); 2015da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2016da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2017da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2018da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2019da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2020da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2021da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2022da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 2023da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2024da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2025da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 2026da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 2027da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2028da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[0] = _mm_packs_epi32(u[0], u[1]); 2029da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[1] = _mm_packs_epi32(u[2], u[3]); 2030da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[2] = _mm_packs_epi32(u[4], u[5]); 2031da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[3] = _mm_packs_epi32(u[6], u[7]); 2032da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[4] = 
_mm_add_epi16(t[4], t[5]); 2033da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[5] = _mm_sub_epi16(t[4], t[5]); 2034da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[6] = _mm_sub_epi16(t[7], t[6]); 2035da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[7] = _mm_add_epi16(t[6], t[7]); 2036da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[8] = t[8]; 2037da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[15] = t[15]; 2038da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[9] = _mm_packs_epi32(u[8], u[9]); 2039da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[14] = _mm_packs_epi32(u[10], u[11]); 2040da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[10] = _mm_packs_epi32(u[12], u[13]); 2041da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[13] = _mm_packs_epi32(u[14], u[15]); 2042da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[11] = t[11]; 2043da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[12] = t[12]; 2044da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2045da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 5 2046da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[0] = _mm_add_epi16(s[0], s[3]); 2047da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[1] = _mm_add_epi16(s[1], s[2]); 2048da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[2] = _mm_sub_epi16(s[1], s[2]); 2049da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[3] = _mm_sub_epi16(s[0], s[3]); 2050da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[4] = s[4]; 2051da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[7] = s[7]; 2052da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2053da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian u[0] = _mm_unpacklo_epi16(s[5], s[6]); 2054da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(s[5], s[6]); 2055da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 2056da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 2057da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 2058da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 2059da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2060da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2061da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2062da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2063da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2064da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2065da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2066da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2067da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[5] = _mm_packs_epi32(u[0], u[1]); 2068da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[6] = _mm_packs_epi32(u[2], u[3]); 2069da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2070da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[8] = _mm_add_epi16(s[8], s[11]); 
2071da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[9] = _mm_add_epi16(s[9], s[10]); 2072da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[10] = _mm_sub_epi16(s[9], s[10]); 2073da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[11] = _mm_sub_epi16(s[8], s[11]); 2074da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[12] = _mm_sub_epi16(s[15], s[12]); 2075da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[13] = _mm_sub_epi16(s[14], s[13]); 2076da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[14] = _mm_add_epi16(s[13], s[14]); 2077da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian t[15] = _mm_add_epi16(s[12], s[15]); 2078da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2079da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 6 2080da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[0] = _mm_add_epi16(t[0], t[7]); 2081da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[1] = _mm_add_epi16(t[1], t[6]); 2082da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[2] = _mm_add_epi16(t[2], t[5]); 2083da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[3] = _mm_add_epi16(t[3], t[4]); 2084da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[4] = _mm_sub_epi16(t[3], t[4]); 2085da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[5] = _mm_sub_epi16(t[2], t[5]); 2086da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[6] = _mm_sub_epi16(t[1], t[6]); 2087da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[7] = _mm_sub_epi16(t[0], t[7]); 2088da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[8] = t[8]; 2089da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[9] = t[9]; 2090da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
2091da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(t[10], t[13]); 2092da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(t[10], t[13]); 2093da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(t[11], t[12]); 2094da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(t[11], t[12]); 2095da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2096da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 2097da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 2098da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 2099da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 2100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 2101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 2102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 2103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 2104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], 
k__DCT_CONST_ROUNDING); 2109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[10] = _mm_packs_epi32(u[0], u[1]); 2124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[13] = _mm_packs_epi32(u[2], u[3]); 2125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[11] = _mm_packs_epi32(u[4], u[5]); 2126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[12] = 
_mm_packs_epi32(u[6], u[7]); 2127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[14] = t[14]; 2128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian s[15] = t[15]; 2129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // stage 7 2131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[0] = _mm_add_epi16(s[0], s[15]); 2132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[1] = _mm_add_epi16(s[1], s[14]); 2133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[2] = _mm_add_epi16(s[2], s[13]); 2134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[3] = _mm_add_epi16(s[3], s[12]); 2135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[4] = _mm_add_epi16(s[4], s[11]); 2136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[5] = _mm_add_epi16(s[5], s[10]); 2137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[6] = _mm_add_epi16(s[6], s[9]); 2138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[7] = _mm_add_epi16(s[7], s[8]); 2139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[8] = _mm_sub_epi16(s[7], s[8]); 2140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[9] = _mm_sub_epi16(s[6], s[9]); 2141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[10] = _mm_sub_epi16(s[5], s[10]); 2142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[11] = _mm_sub_epi16(s[4], s[11]); 2143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[12] = _mm_sub_epi16(s[3], s[12]); 2144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[13] = _mm_sub_epi16(s[2], s[13]); 2145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[14] = _mm_sub_epi16(s[1], s[14]); 
2146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[15] = _mm_sub_epi16(s[0], s[15]); 2147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 2148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* idct16_sse2: calls array_transpose_16x16(in0, in1), then runs idct16_8col on in0 and on in1, in that order. NOTE(review): the function ending just above looks like idct16_8col and stores its final sums/differences back into in[0..15], so both arrays are presumably transformed in place; in0/in1 presumably hold the two 8-column halves of a 16x16 coefficient block -- confirm against callers. */ 2149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid idct16_sse2(__m128i *in0, __m128i *in1) { 2150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_16x16(in0, in1); 2151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct16_8col(in0); 2152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct16_8col(in1); 2153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 2154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* iadst16_sse2: same call sequence as idct16_sse2 (array_transpose_16x16 first, then one pass per array), but the per-array pass is iadst16_8col. NOTE(review): iadst16_8col is defined outside this view; presumably it also overwrites its argument in place -- confirm. */ 2155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid iadst16_sse2(__m128i *in0, __m128i *in1) { 2156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_16x16(in0, in1); 2157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iadst16_8col(in0); 2158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iadst16_8col(in1); 2159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 2160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 21612263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, 2162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int stride) { 2163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 2164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i final_rounding = _mm_set1_epi16(1 << 5); 2165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128(); 2166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
2167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 2168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 2169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 2170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 2171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 2173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 2174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 2176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 2177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 2179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 2180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 
2183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i in[16], l[16]; 2184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, 2185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 2186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8_0, stp1_12_0; 2187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 2188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; 2189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 2190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int i; 2191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // First 1-D inverse DCT 2192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Load input data. 
21932263fc984bdc858ee931d3e35c87c404de923950Johann in[0] = load_input_data(input); 21942263fc984bdc858ee931d3e35c87c404de923950Johann in[1] = load_input_data(input + 8 * 2); 21952263fc984bdc858ee931d3e35c87c404de923950Johann in[2] = load_input_data(input + 8 * 4); 21962263fc984bdc858ee931d3e35c87c404de923950Johann in[3] = load_input_data(input + 8 * 6); 2197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); 2199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage2 2201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 2202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); 2203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); 2204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); 2206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); 2207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); 2208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); 2209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); 2211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); 2212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp5 = _mm_add_epi32(tmp5, rounding); 
2213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp7 = _mm_add_epi32(tmp7, rounding); 2214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 2218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 2219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_8 = _mm_packs_epi32(tmp0, tmp2); 2221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_11 = _mm_packs_epi32(tmp5, tmp7); 2222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 2223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage3 2225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 2226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); 2227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); 2229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); 2230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); 2232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); 2233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); 2237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); 2238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_4 = _mm_packs_epi32(tmp0, tmp2); 2240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 2241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage4 2243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 2244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); 2245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); 2246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); 2247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); 2249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); 2250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); 2251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); 2252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); 
2253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); 2254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); 2256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); 2257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_add_epi32(tmp1, rounding); 2258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_add_epi32(tmp3, rounding); 2259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp5 = _mm_add_epi32(tmp5, rounding); 2260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp7 = _mm_add_epi32(tmp7, rounding); 2261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 2265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 2266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 2267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 2268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_0 = _mm_packs_epi32(tmp0, tmp0); 2270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_1 = _mm_packs_epi32(tmp2, tmp2); 2271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_9 = _mm_packs_epi32(tmp1, 
tmp3); 2272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_10 = _mm_packs_epi32(tmp5, tmp7); 2273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); 2275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 2276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage5 and Stage6 2278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 2279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi16(stp2_8, stp2_11); 2280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_sub_epi16(stp2_8, stp2_11); 2281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_add_epi16(stp2_9, stp2_10); 2282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_sub_epi16(stp2_9, stp2_10); 2283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_9 = _mm_unpacklo_epi64(tmp2, zero); 2285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_10 = _mm_unpacklo_epi64(tmp3, zero); 2286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8 = _mm_unpacklo_epi64(tmp0, zero); 2287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_11 = _mm_unpacklo_epi64(tmp1, zero); 2288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_13 = _mm_unpackhi_epi64(tmp3, zero); 2290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_14 = _mm_unpackhi_epi64(tmp2, zero); 2291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_12 = _mm_unpackhi_epi64(tmp1, zero); 
2292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_15 = _mm_unpackhi_epi64(tmp0, zero); 2293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 2294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage6 2296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 2297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); 2298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); 2299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); 2300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); 2302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); 2303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); 2304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); 2305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); 2306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); 2307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_add_epi32(tmp1, rounding); 2309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_add_epi32(tmp3, rounding); 2310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); 2311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); 2312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp4 = _mm_add_epi32(tmp4, rounding); 2313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp6 = _mm_add_epi32(tmp6, rounding); 2314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 2316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 2317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 2320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 2321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_6 = _mm_packs_epi32(tmp3, tmp1); 2323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_10 = _mm_packs_epi32(tmp0, zero); 2325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_13 = _mm_packs_epi32(tmp2, zero); 2326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_11 = _mm_packs_epi32(tmp4, zero); 2327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_12 = _mm_packs_epi32(tmp6, zero); 2328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi16(stp1_0, stp1_4); 2330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = 
_mm_sub_epi16(stp1_0, stp1_4); 2331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_add_epi16(stp1_1, stp1_6); 2332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_sub_epi16(stp1_1, stp1_6); 2333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_0 = _mm_unpackhi_epi64(tmp0, zero); 2335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_1 = _mm_unpacklo_epi64(tmp2, zero); 2336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_2 = _mm_unpackhi_epi64(tmp2, zero); 2337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_3 = _mm_unpacklo_epi64(tmp0, zero); 2338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_4 = _mm_unpacklo_epi64(tmp1, zero); 2339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_5 = _mm_unpackhi_epi64(tmp3, zero); 2340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_6 = _mm_unpacklo_epi64(tmp3, zero); 2341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_7 = _mm_unpackhi_epi64(tmp1, zero); 2342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 2343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage7. Left 8x16 only. 
2345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[0] = _mm_add_epi16(stp2_0, stp1_15); 2346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[1] = _mm_add_epi16(stp2_1, stp1_14); 2347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[2] = _mm_add_epi16(stp2_2, stp2_13); 2348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[3] = _mm_add_epi16(stp2_3, stp2_12); 2349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[4] = _mm_add_epi16(stp2_4, stp2_11); 2350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[5] = _mm_add_epi16(stp2_5, stp2_10); 2351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[6] = _mm_add_epi16(stp2_6, stp1_9); 2352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[7] = _mm_add_epi16(stp2_7, stp1_8); 2353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[8] = _mm_sub_epi16(stp2_7, stp1_8); 2354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[9] = _mm_sub_epi16(stp2_6, stp1_9); 2355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[10] = _mm_sub_epi16(stp2_5, stp2_10); 2356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[11] = _mm_sub_epi16(stp2_4, stp2_11); 2357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[12] = _mm_sub_epi16(stp2_3, stp2_12); 2358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[13] = _mm_sub_epi16(stp2_2, stp2_13); 2359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[14] = _mm_sub_epi16(stp2_1, stp1_14); 2360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian l[15] = _mm_sub_epi16(stp2_0, stp1_15); 2361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Second 1-D inverse transform, performed per 8x16 block 
2363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 2; i++) { 2364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int j; 2365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_4X8(l + 8 * i, in); 2366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian IDCT16_10 2368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Stage7 2370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[0] = _mm_add_epi16(stp2_0, stp1_15); 2371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[1] = _mm_add_epi16(stp2_1, stp1_14); 2372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[2] = _mm_add_epi16(stp2_2, stp2_13); 2373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[3] = _mm_add_epi16(stp2_3, stp2_12); 2374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[4] = _mm_add_epi16(stp2_4, stp2_11); 2375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[5] = _mm_add_epi16(stp2_5, stp2_10); 2376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[6] = _mm_add_epi16(stp2_6, stp1_9); 2377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[7] = _mm_add_epi16(stp2_7, stp1_8); 2378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[8] = _mm_sub_epi16(stp2_7, stp1_8); 2379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[9] = _mm_sub_epi16(stp2_6, stp1_9); 2380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[10] = _mm_sub_epi16(stp2_5, stp2_10); 2381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[11] = _mm_sub_epi16(stp2_4, stp2_11); 2382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[12] = 
_mm_sub_epi16(stp2_3, stp2_12); 2383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[13] = _mm_sub_epi16(stp2_2, stp2_13); 2384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[14] = _mm_sub_epi16(stp2_1, stp1_14); 2385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[15] = _mm_sub_epi16(stp2_0, stp1_15); 2386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 16; ++j) { 2388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Final rounding and shift 2389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[j] = _mm_adds_epi16(in[j], final_rounding); 2390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[j] = _mm_srai_epi16(in[j], 6); 2391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + j * stride, in[j]); 2392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 2393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest += 8; 2395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 2396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 2397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* LOAD_DQCOEFF(reg, input): assigns load_input_data(input) to reg, then advances input by 8. Expands to a brace-enclosed block, not an expression. Both arguments must be modifiable lvalues: reg is assigned, and input is named twice in the expansion and is changed in the caller's scope, so do not pass an argument with side effects. */ 2398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define LOAD_DQCOEFF(reg, input) \ 2399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { \ 24002263fc984bdc858ee931d3e35c87c404de923950Johann reg = load_input_data(input); \ 2401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input += 8; \ 2402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 2403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define IDCT32_34 \ 
2405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage1 */ \ 2406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128();\ 2408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ 2409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ 2410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ 2412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ 2413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ 2415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ 2416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ 2418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ 2419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ 2421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg1_1, stp1_16, stp1_31); \ 2422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ 
2423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg1_7, stp1_19, stp1_28); \ 2424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ 2425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg1_9, stp1_20, stp1_27); \ 2426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ 2427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg1_15, stp1_23, stp1_24); \ 2428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage2 */ \ 2431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128();\ 2433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ 2434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ 2435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ 2437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ 2438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ 2440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg2_1, stp2_8, stp2_15); \ 2441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ 
2442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg2_7, stp2_11, stp2_12); \ 2443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_16 = stp1_16; \ 2445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_19 = stp1_19; \ 2446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_20 = stp1_20; \ 2448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_23 = stp1_23; \ 2449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_24 = stp1_24; \ 2451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_27 = stp1_27; \ 2452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_28 = stp1_28; \ 2454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_31 = stp1_31; \ 2455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage3 */ \ 2458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128();\ 2460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ 2461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ 2462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_17_30 = 
_mm_unpacklo_epi16(stp1_16, stp1_31); \ 2464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ 2465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ 2466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ 2467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ 2469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ 2470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ 2471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ 2472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ 2474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg3_1, stp1_4, stp1_7); \ 2475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8 = stp2_8; \ 2477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_11 = stp2_11; \ 2478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_12 = stp2_12; \ 2479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_15 = stp2_15; \ 2480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, 
hi_18_29, stg3_4, \ 2482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ 2483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_18, stp1_29) \ 2484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ 2485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ 2486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_22, stp1_25) \ 2487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_16 = stp2_16; \ 2489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_31 = stp2_31; \ 2490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_19 = stp2_19; \ 2491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_20 = stp2_20; \ 2492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_23 = stp2_23; \ 2493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_24 = stp2_24; \ 2494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_27 = stp2_27; \ 2495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_28 = stp2_28; \ 2496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage4 */ \ 2499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128();\ 2501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ 2502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ 2503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ 2505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ 2506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ 2507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ 2508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ 2510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_1, stp2_0, stp2_1); \ 2511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_4 = stp1_4; \ 2513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_5 = stp1_4; \ 2514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_6 = stp1_7; \ 2515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_7 = stp1_7; \ 2516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ 2518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ 2519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_10, stp2_13) \ 2520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_8 = stp1_8; 
\ 2522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_15 = stp1_15; \ 2523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_11 = stp1_11; \ 2524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_12 = stp1_12; \ 2525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ 2527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ 2528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ 2529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ 2530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ 2531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ 2532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ 2533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ 2534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ 2536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ 2537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ 2538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ 2539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ 2540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ 2541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ 2542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ 2543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage5 */ \ 2546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 2548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 2549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 2550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 2551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ 2553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ 2554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 2555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 2556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 
2558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 2559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_0 = stp2_0; \ 2561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_1 = stp2_1; \ 2562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_2 = stp2_1; \ 2563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_3 = stp2_0; \ 2564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 2566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 2567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 2568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 2569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); \ 2571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_add_epi32(tmp1, rounding); \ 2572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); \ 2573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_add_epi32(tmp3, rounding); \ 2574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 2576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 2577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = 
_mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 2578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 2579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 2581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 2582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_4 = stp2_4; \ 2584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_7 = stp2_7; \ 2585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ 2587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 2588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 2589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ 2590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ 2591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 2592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 2593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ 2594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_16 = stp2_16; \ 2596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_17 = stp2_17; \ 
2597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ 2599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ 2600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_19, stp1_28) \ 2601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ 2602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ 2603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_21, stp1_26) \ 2604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_22 = stp2_22; \ 2606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_23 = stp2_23; \ 2607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_24 = stp2_24; \ 2608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_25 = stp2_25; \ 2609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_30 = stp2_30; \ 2610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_31 = stp2_31; \ 2611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage6 */ \ 2614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 2616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 
2617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 2618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 2619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ 2621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 2622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 2623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ 2624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ 2625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 2626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 2627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ 2628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_8 = stp1_8; \ 2630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_9 = stp1_9; \ 2631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_14 = stp1_14; \ 2632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_15 = stp1_15; \ 2633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2634da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 2635da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg6_0, stg4_0, stg6_0, stg4_0, 
stp2_10, \ 2636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_13, stp2_11, stp2_12) \ 2637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ 2639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ 2640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ 2641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ 2642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ 2643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ 2644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ 2645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ 2646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ 2648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ 2649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ 2650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ 2651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ 2652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ 2653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ 
2654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ 2655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage7 */ \ 2658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 2660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 2661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 2662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 2663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 2665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 2666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ 2667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ 2668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ 2670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ 2671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ 
2672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ 2673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ 2674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ 2675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ 2676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ 2677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ 2678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ 2679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ 2680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ 2681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ 2682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ 2683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ 2684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ 2685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_16 = stp2_16; \ 2687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_17 = stp2_17; \ 2688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_18 = stp2_18; \ 2689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_19 = stp2_19; \ 2690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 
2691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ 2692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ 2693da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_21, stp1_26) \ 2694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ 2695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ 2696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_23, stp1_24) \ 2697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_28 = stp2_28; \ 2699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_29 = stp2_29; \ 2700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_30 = stp2_30; \ 2701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_31 = stp2_31; \ 2702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 2703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define IDCT32 \ 2706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage1 */ \ 2707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ 2709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ 2710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ 
2711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ 2712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ 2714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ 2715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ 2716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ 2717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ 2719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ 2720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ 2721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ 2722da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ 2724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ 2725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ 2726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ 2727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian \ 2728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ 2729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ 2730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_17, stp1_30) \ 2731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ 2732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ 2733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_19, stp1_28) \ 2734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ 2735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ 2736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_21, stp1_26) \ 2737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ 2738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ 2739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_23, stp1_24) \ 2740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2742da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage2 */ \ 2743da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2744da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ 2745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ 
2746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ 2747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ 2748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ 2750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ 2751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ 2752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ 2753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2754da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ 2755da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ 2756da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_14) \ 2757da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ 2758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ 2759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_11, stp2_12) \ 2760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ 2762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ 2763da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ 2764da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ 2765da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ 2767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ 2768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ 2769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ 2770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ 2772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ 2773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ 2774da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ 2775da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ 2777da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ 2778da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ 2779da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ 2780da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2782da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian/* Stage3 */ \ 2783da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ 2785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ 2786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ 2787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ 2788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ 2790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ 2791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 2792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 2793da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2794da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 2795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 2796da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 2797da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 2798da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2799da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ 2800da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ 2801da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_6) \ 2802da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2803da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ 2804da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ 2805da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ 2806da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ 2807da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ 2808da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ 2809da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ 2810da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ 2811da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2812da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ 2813da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ 2814da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_18, stp1_29) \ 2815da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ 2816da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ 
2817da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_22, stp1_25) \ 2818da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2819da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_16 = stp2_16; \ 2820da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_31 = stp2_31; \ 2821da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_19 = stp2_19; \ 2822da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_20 = stp2_20; \ 2823da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_23 = stp2_23; \ 2824da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_24 = stp2_24; \ 2825da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_27 = stp2_27; \ 2826da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_28 = stp2_28; \ 2827da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2828da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2829da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage4 */ \ 2830da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2831da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ 2832da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ 2833da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ 2834da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ 2835da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2836da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 2837da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 2838da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 2839da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 2840da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2841da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ 2842da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ 2843da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_2, stp2_3) \ 2844da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2845da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ 2846da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ 2847da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ 2848da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ 2849da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2850da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ 2851da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ 2852da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_10, stp2_13) \ 2853da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2854da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_8 = stp1_8; \ 2855da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_15 = stp1_15; \ 
2856da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_11 = stp1_11; \ 2857da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_12 = stp1_12; \ 2858da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2859da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ 2860da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ 2861da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ 2862da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ 2863da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ 2864da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ 2865da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ 2866da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ 2867da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2868da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ 2869da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ 2870da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ 2871da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ 2872da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ 2873da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ 
2874da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ 2875da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ 2876da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2877da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2878da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage5 */ \ 2879da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2880da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 2881da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 2882da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 2883da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 2884da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2885da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ 2886da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ 2887da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 2888da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 2889da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2890da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 2891da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_21_26 = 
_mm_unpackhi_epi16(stp2_21, stp2_26); \ 2892da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2893da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ 2894da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ 2895da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ 2896da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ 2897da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2898da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 2899da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 2900da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 2901da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 2902da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2903da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_add_epi32(tmp0, rounding); \ 2904da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_add_epi32(tmp1, rounding); \ 2905da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = _mm_add_epi32(tmp2, rounding); \ 2906da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_add_epi32(tmp3, rounding); \ 2907da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2908da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 2909da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 2910da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = 
_mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 2911da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 2912da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2913da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 2914da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 2915da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2916da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_4 = stp2_4; \ 2917da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_7 = stp2_7; \ 2918da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2919da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ 2920da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 2921da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 2922da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ 2923da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ 2924da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 2925da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 2926da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ 2927da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2928da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_16 = stp2_16; \ 2929da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_17 = stp2_17; \ 
2930da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2931da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ 2932da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ 2933da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_19, stp1_28) \ 2934da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ 2935da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ 2936da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_21, stp1_26) \ 2937da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2938da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_22 = stp2_22; \ 2939da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_23 = stp2_23; \ 2940da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_24 = stp2_24; \ 2941da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_25 = stp2_25; \ 2942da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_30 = stp2_30; \ 2943da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_31 = stp2_31; \ 2944da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2945da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2946da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage6 */ \ 2947da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2948da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 2949da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 
2950da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 2951da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 2952da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2953da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ 2954da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 2955da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 2956da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ 2957da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ 2958da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 2959da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 2960da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ 2961da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2962da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_8 = stp1_8; \ 2963da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_9 = stp1_9; \ 2964da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_14 = stp1_14; \ 2965da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_15 = stp1_15; \ 2966da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2967da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 2968da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg6_0, stg4_0, stg6_0, stg4_0, 
stp2_10, \ 2969da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_13, stp2_11, stp2_12) \ 2970da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2971da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ 2972da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ 2973da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ 2974da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ 2975da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ 2976da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ 2977da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ 2978da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ 2979da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2980da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ 2981da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ 2982da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ 2983da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ 2984da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ 2985da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ 2986da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ 
2987da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ 2988da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} \ 2989da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian\ 2990da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* Stage7 */ \ 2991da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{ \ 2992da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 2993da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 2994da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 2995da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 2996da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 2997da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 2998da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 2999da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ 3000da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ 3001da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 3002da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ 3003da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ 3004da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ 
3005da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ 3006da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ 3007da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ 3008da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ 3009da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ 3010da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ 3011da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ 3012da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ 3013da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ 3014da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ 3015da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ 3016da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ 3017da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ 3018da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 3019da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_16 = stp2_16; \ 3020da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_17 = stp2_17; \ 3021da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_18 = stp2_18; \ 3022da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_19 = stp2_19; \ 3023da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 
3024da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ 3025da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ 3026da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_21, stp1_26) \ 3027da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ 3028da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ 3029da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_23, stp1_24) \ 3030da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 3031da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_28 = stp2_28; \ 3032da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_29 = stp2_29; \ 3033da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_30 = stp2_30; \ 3034da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_31 = stp2_31; \ 3035da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 3036da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3037da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian// Only upper-left 8x8 has non-zero coeff 30382263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, 3039da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int stride) { 3040da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3041da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i final_rounding = _mm_set1_epi16(1<<5); 3042da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3043da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
// idct constants for each stage 3044da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3045da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3046da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 3047da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 3048da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 3049da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); 3050da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); 3051da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); 3052da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3053da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 3054da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 3055da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 3056da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 3057da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3058da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 3059da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 3060da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 3061da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 3062da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 3063da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 3064da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 3065da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 3066da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3067da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 3068da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 3069da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 3070da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 3071da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 3072da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3073da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 3074da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3075da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian __m128i in[32], col[32]; 3076da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 3077da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 3078da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 3079da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 3080da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp1_30, stp1_31; 3081da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 3082da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 3083da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 3084da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 3085da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian stp2_30, stp2_31; 3086da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3087da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int i; 3088da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3089da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Load input data. Only need to load the top left 8x8 block. 
30902263fc984bdc858ee931d3e35c87c404de923950Johann in[0] = load_input_data(input); 30912263fc984bdc858ee931d3e35c87c404de923950Johann in[1] = load_input_data(input + 32); 30922263fc984bdc858ee931d3e35c87c404de923950Johann in[2] = load_input_data(input + 64); 30932263fc984bdc858ee931d3e35c87c404de923950Johann in[3] = load_input_data(input + 96); 30942263fc984bdc858ee931d3e35c87c404de923950Johann in[4] = load_input_data(input + 128); 30952263fc984bdc858ee931d3e35c87c404de923950Johann in[5] = load_input_data(input + 160); 30962263fc984bdc858ee931d3e35c87c404de923950Johann in[6] = load_input_data(input + 192); 30972263fc984bdc858ee931d3e35c87c404de923950Johann in[7] = load_input_data(input + 224); 3098da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3099da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 8; i < 32; ++i) { 3100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[i] = _mm_setzero_si128(); 3101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(in, in); 3104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // TODO(hkuang): Following transposes are unnecessary. But remove them will 3105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // lead to performance drop on some devices. 
3106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(in + 8, in + 8); 3107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(in + 16, in + 16); 3108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(in + 24, in + 24); 3109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian IDCT32_34 3111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 1_D: Store 32 intermediate results for each 8x32 block. 3113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[0] = _mm_add_epi16(stp1_0, stp1_31); 3114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[1] = _mm_add_epi16(stp1_1, stp1_30); 3115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[2] = _mm_add_epi16(stp1_2, stp1_29); 3116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[3] = _mm_add_epi16(stp1_3, stp1_28); 3117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[4] = _mm_add_epi16(stp1_4, stp1_27); 3118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[5] = _mm_add_epi16(stp1_5, stp1_26); 3119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[6] = _mm_add_epi16(stp1_6, stp1_25); 3120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[7] = _mm_add_epi16(stp1_7, stp1_24); 3121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[8] = _mm_add_epi16(stp1_8, stp1_23); 3122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[9] = _mm_add_epi16(stp1_9, stp1_22); 3123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[10] = _mm_add_epi16(stp1_10, stp1_21); 3124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[11] = 
_mm_add_epi16(stp1_11, stp1_20); 3125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[12] = _mm_add_epi16(stp1_12, stp1_19); 3126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[13] = _mm_add_epi16(stp1_13, stp1_18); 3127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[14] = _mm_add_epi16(stp1_14, stp1_17); 3128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[15] = _mm_add_epi16(stp1_15, stp1_16); 3129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[16] = _mm_sub_epi16(stp1_15, stp1_16); 3130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[17] = _mm_sub_epi16(stp1_14, stp1_17); 3131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[18] = _mm_sub_epi16(stp1_13, stp1_18); 3132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[19] = _mm_sub_epi16(stp1_12, stp1_19); 3133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[20] = _mm_sub_epi16(stp1_11, stp1_20); 3134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[21] = _mm_sub_epi16(stp1_10, stp1_21); 3135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[22] = _mm_sub_epi16(stp1_9, stp1_22); 3136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[23] = _mm_sub_epi16(stp1_8, stp1_23); 3137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[24] = _mm_sub_epi16(stp1_7, stp1_24); 3138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[25] = _mm_sub_epi16(stp1_6, stp1_25); 3139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[26] = _mm_sub_epi16(stp1_5, stp1_26); 3140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[27] = _mm_sub_epi16(stp1_4, stp1_27); 3141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[28] = _mm_sub_epi16(stp1_3, stp1_28); 
3142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[29] = _mm_sub_epi16(stp1_2, stp1_29); 3143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[30] = _mm_sub_epi16(stp1_1, stp1_30); 3144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian col[31] = _mm_sub_epi16(stp1_0, stp1_31); 3145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; i++) { 3146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int j; 3147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128(); 3148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Transpose 32x8 block to 8x32 block 3149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(col + i * 8, in); 3150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian IDCT32_34 3151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 2_D: Calculate the results and store them to destination. 
3153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[0] = _mm_add_epi16(stp1_0, stp1_31); 3154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[1] = _mm_add_epi16(stp1_1, stp1_30); 3155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[2] = _mm_add_epi16(stp1_2, stp1_29); 3156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[3] = _mm_add_epi16(stp1_3, stp1_28); 3157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[4] = _mm_add_epi16(stp1_4, stp1_27); 3158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[5] = _mm_add_epi16(stp1_5, stp1_26); 3159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[6] = _mm_add_epi16(stp1_6, stp1_25); 3160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[7] = _mm_add_epi16(stp1_7, stp1_24); 3161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[8] = _mm_add_epi16(stp1_8, stp1_23); 3162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[9] = _mm_add_epi16(stp1_9, stp1_22); 3163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[10] = _mm_add_epi16(stp1_10, stp1_21); 3164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[11] = _mm_add_epi16(stp1_11, stp1_20); 3165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[12] = _mm_add_epi16(stp1_12, stp1_19); 3166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[13] = _mm_add_epi16(stp1_13, stp1_18); 3167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[14] = _mm_add_epi16(stp1_14, stp1_17); 3168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[15] = _mm_add_epi16(stp1_15, stp1_16); 3169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[16] = _mm_sub_epi16(stp1_15, stp1_16); 3170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[17] = 
_mm_sub_epi16(stp1_14, stp1_17); 3171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[18] = _mm_sub_epi16(stp1_13, stp1_18); 3172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[19] = _mm_sub_epi16(stp1_12, stp1_19); 3173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[20] = _mm_sub_epi16(stp1_11, stp1_20); 3174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[21] = _mm_sub_epi16(stp1_10, stp1_21); 3175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[22] = _mm_sub_epi16(stp1_9, stp1_22); 3176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[23] = _mm_sub_epi16(stp1_8, stp1_23); 3177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[24] = _mm_sub_epi16(stp1_7, stp1_24); 3178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[25] = _mm_sub_epi16(stp1_6, stp1_25); 3179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[26] = _mm_sub_epi16(stp1_5, stp1_26); 3180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[27] = _mm_sub_epi16(stp1_4, stp1_27); 3181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[28] = _mm_sub_epi16(stp1_3, stp1_28); 3182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[29] = _mm_sub_epi16(stp1_2, stp1_29); 3183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[30] = _mm_sub_epi16(stp1_1, stp1_30); 3184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[31] = _mm_sub_epi16(stp1_0, stp1_31); 3185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 32; ++j) { 3187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Final rounding and shift 3188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[j] = _mm_adds_epi16(in[j], final_rounding); 
3189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[j] = _mm_srai_epi16(in[j], 6); 3190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian RECON_AND_STORE(dest + j * stride, in[j]); 3191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest += 8; 3194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 3196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian

// Two-pass 32x32 inverse transform whose rounded result is combined with the
// pixels at |dest| through RECON_AND_STORE.
//
//   input  - coefficient buffer, consumed front to back through LOAD_DQCOEFF
//            (4 bands x 32 loads).
//   dest   - destination pixels, updated in place in four 8-pixel-wide strips
//            of 32 rows each.
//   stride - distance between consecutive rows of |dest|, in bytes.
//
// NOTE(review): IDCT32, LOAD_DQCOEFF and RECON_AND_STORE are macros defined
// outside this function. They appear to pick up locals of this function by
// name (in, stp1_*, stp2_*, tmp*, stg*_*, rounding, zero), so none of those
// locals may be renamed or removed without checking the macro bodies.
void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  // Stage 1 cosine pairs.
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  // Stage 2 cosine pairs.
  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  // Stage 3 cosine pairs (there is no stg3_7 in this table).
  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  // Stage 4 cosine pairs.
  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  // Stage 6 cosine pair.
  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  // in:       32 working registers for the band currently being processed.
  // col:      first-pass output, 32 registers for each of the 4 bands.
  // zero_idx: scratch for the all-zero test on a freshly loaded band.
  __m128i in[32], col[128], zero_idx[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
      stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
      stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
      stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
      stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j, i32;

  // ---- Pass 1: one band per iteration, results parked in col[]. ----
  for (i = 0; i < 4; ++i) {
    int k;

    // Base index of this band's 32 slots in col[].
    i32 = (i << 5);

    // Fill in[] in the order 0, 8, 16, 24, 1, 9, 17, 25, ..., 7, 15, 23, 31.
    // The order matters: LOAD_DQCOEFF presumably advances |input| on every
    // use (confirm against its definition).
    for (k = 0; k < 8; ++k) {
      LOAD_DQCOEFF(in[k], input);
      LOAD_DQCOEFF(in[k + 8], input);
      LOAD_DQCOEFF(in[k + 16], input);
      LOAD_DQCOEFF(in[k + 24], input);
    }

    // OR all 32 registers together pairwise; the final word ends up in
    // zero_idx[14] and is zero only if every loaded register is zero.
    for (k = 0; k < 16; ++k) {
      zero_idx[k] = _mm_or_si128(in[2 * k], in[2 * k + 1]);
    }
    for (k = 0; k < 8; ++k) {
      zero_idx[k] = _mm_or_si128(zero_idx[2 * k], zero_idx[2 * k + 1]);
    }
    for (k = 0; k < 4; ++k) {
      zero_idx[8 + k] = _mm_or_si128(zero_idx[2 * k], zero_idx[2 * k + 1]);
    }
    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);

    // Nothing but zeros in this band: its first-pass output is all zero too,
    // so write that directly and skip the transform.
    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
      for (k = 0; k < 32; ++k) {
        col[i32 + k] = _mm_setzero_si128();
      }
      continue;
    }

    // Transpose each 8x8 sub-block of the band in place.
    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);
    array_transpose_8x8(in + 16, in + 16);
    array_transpose_8x8(in + 24, in + 24);

    IDCT32

    // Last butterfly: slot n gets the sum and slot 31 - n the difference of
    // the same stp1 pair.
    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
  }

  // ---- Pass 2: one 8-wide strip of |dest| per iteration. ----
  for (i = 0; i < 4; ++i) {
    // First of the eight col[] registers this strip takes from band 0; the
    // other three bands sit 32, 64 and 96 registers further on.
    __m128i *const src = col + (i << 3);

    array_transpose_8x8(src, in);
    array_transpose_8x8(src + 32, in + 8);
    array_transpose_8x8(src + 64, in + 16);
    array_transpose_8x8(src + 96, in + 24);

    IDCT32

    // Same closing butterfly as in pass 1, this time into in[].
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);

    for (j = 0; j < 32; ++j) {
      // Saturating add of the rounding bias (1 << 5), then an arithmetic
      // shift right by 6.
      const __m128i biased = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(biased, 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    // Step to the next 8-pixel-wide strip.
    dest += 8;
  }
}

 3472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 34732263fc984bdc858ee931d3e35c87c404de923950Johannvoid vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, 34742263fc984bdc858ee931d3e35c87c404de923950Johann int stride) { 3475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i dc_value; 3476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_setzero_si128(); 34772263fc984bdc858ee931d3e35c87c404de923950Johann int a, j; 3478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = dct_const_round_shift(input[0] * cospi_16_64); 3480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = dct_const_round_shift(a * cospi_16_64); 3481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian a = ROUND_POWER_OF_TWO(a, 6); 3482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dc_value = _mm_set1_epi16(a); 3484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 34852263fc984bdc858ee931d3e35c87c404de923950Johann for (j = 0; j < 32; ++j) { 34862263fc984bdc858ee931d3e35c87c404de923950Johann RECON_AND_STORE(dest + 0 + j * stride, dc_value); 34872263fc984bdc858ee931d3e35c87c404de923950Johann 
RECON_AND_STORE(dest + 8 + j * stride, dc_value); 34882263fc984bdc858ee931d3e35c87c404de923950Johann RECON_AND_STORE(dest + 16 + j * stride, dc_value); 34892263fc984bdc858ee931d3e35c87c404de923950Johann RECON_AND_STORE(dest + 24 + j * stride, dc_value); 3490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 3492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#if CONFIG_VP9_HIGHBITDEPTH 3494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic INLINE __m128i clamp_high_sse2(__m128i value, int bd) { 3495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i ubounded, retval; 3496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_set1_epi16(0); 3497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i one = _mm_set1_epi16(1); 3498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); 3499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ubounded = _mm_cmpgt_epi16(value, max); 3500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian retval = _mm_andnot_si128(ubounded, value); 3501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ubounded = _mm_and_si128(ubounded, max); 3502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian retval = _mm_or_si128(retval, ubounded); 3503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); 3504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return retval; 3505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 3506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
3507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, 3508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int stride, int bd) { 3509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t out[4 * 4]; 3510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t *outptr = out; 3511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int i, j; 3512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i inptr[4]; 3513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i sign_bits[2]; 3514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i temp_mm, min_input, max_input; 3515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int test; 3516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 3517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int optimised_cols = 0; 3518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_set1_epi16(0); 3519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i eight = _mm_set1_epi16(8); 3520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i max = _mm_set1_epi16(12043); 3521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i min = _mm_set1_epi16(-12043); 3522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Load input into __m128i 3523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[0] = _mm_loadu_si128((const __m128i *)input); 3524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); 3525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[2] = 
_mm_loadu_si128((const __m128i *)(input + 8)); 3526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); 3527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Pack to 16 bits 3529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); 3530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); 3531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(inptr[0], inptr[1]); 3533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(inptr[0], inptr[1]); 3534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_cmpgt_epi16(max_input, max); 3535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_cmplt_epi16(min_input, min); 3536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp_mm = _mm_or_si128(max_input, min_input); 3537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian test = _mm_movemask_epi8(temp_mm); 3538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (!test) { 3540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Do the row transform 3541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct4_sse2(inptr); 3542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Check the min & max values 3544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(inptr[0], inptr[1]); 
3545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(inptr[0], inptr[1]); 3546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_cmpgt_epi16(max_input, max); 3547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_cmplt_epi16(min_input, min); 3548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp_mm = _mm_or_si128(max_input, min_input); 3549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian test = _mm_movemask_epi8(temp_mm); 3550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (test) { 3552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian transpose_4x4(inptr); 3553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); 3554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); 3555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); 3556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); 3557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); 3558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); 3559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)outptr, inptr[0]); 3560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); 3561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); 3562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
_mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); 3563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 3564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Set to use the optimised transform for the column 3565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian optimised_cols = 1; 3566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 3568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Run the un-optimised row transform 3569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; ++i) { 3570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vpx_highbd_idct4_c(input, outptr, bd); 3571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input += 4; 3572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian outptr += 4; 3573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (optimised_cols) { 3577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct4_sse2(inptr); 3578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Final round and shift 3580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[0] = _mm_add_epi16(inptr[0], eight); 3581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[1] = _mm_add_epi16(inptr[1], eight); 3582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[0] = _mm_srai_epi16(inptr[0], 4); 
3584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[1] = _mm_srai_epi16(inptr[1], 4); 3585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Reconstruction and Store 3587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 3588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); 3589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); 3590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = _mm_unpacklo_epi64( 3591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); 3592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d2 = _mm_unpacklo_epi64( 3593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); 3594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); 3595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); 3596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // store input0 3597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storel_epi64((__m128i *)dest, d0); 3598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // store input1 3599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d0 = _mm_srli_si128(d0, 8); 3600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storel_epi64((__m128i *)(dest + stride), d0); 3601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // store input2 3602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
_mm_storel_epi64((__m128i *)(dest + stride * 2), d2); 3603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // store input3 3604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d2 = _mm_srli_si128(d2, 8); 3605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); 3606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 3608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Run the un-optimised column transform 3609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t temp_in[4], temp_out[4]; 3610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Columns 3611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; ++i) { 3612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 4; ++j) 3613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp_in[j] = out[j * 4 + i]; 3614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vpx_highbd_idct4_c(temp_in, temp_out, bd); 3615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 4; ++j) { 3616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest[j * stride + i] = highbd_clip_pixel_add( 3617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); 3618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 3622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanianvoid vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, 3624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int stride, int bd) { 3625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t out[8 * 8]; 3626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t *outptr = out; 3627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int i, j, test; 3628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i inptr[8]; 3629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i min_input, max_input, temp1, temp2, sign_bits; 3630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 3631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_set1_epi16(0); 3632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i sixteen = _mm_set1_epi16(16); 3633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i max = _mm_set1_epi16(6201); 3634da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i min = _mm_set1_epi16(-6201); 3635da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int optimised_cols = 0; 3636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Load input into __m128i & pack to 16 bits 3638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 8; i++) { 3639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); 3640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); 3641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i] = _mm_packs_epi32(temp1, 
temp2); 3642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Find the min & max for the row transform 3645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(inptr[0], inptr[1]); 3646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(inptr[0], inptr[1]); 3647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 2; i < 8; i++) { 3648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(max_input, inptr[i]); 3649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(min_input, inptr[i]); 3650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_cmpgt_epi16(max_input, max); 3652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_cmplt_epi16(min_input, min); 3653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_or_si128(max_input, min_input); 3654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian test = _mm_movemask_epi8(temp1); 3655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (!test) { 3657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Do the row transform 3658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8_sse2(inptr); 3659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Find the min & max for the column transform 3661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(inptr[0], 
inptr[1]); 3662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(inptr[0], inptr[1]); 3663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 2; i < 8; i++) { 3664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(max_input, inptr[i]); 3665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(min_input, inptr[i]); 3666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_cmpgt_epi16(max_input, max); 3668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_cmplt_epi16(min_input, min); 3669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_or_si128(max_input, min_input); 3670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian test = _mm_movemask_epi8(temp1); 3671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (test) { 3673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(inptr, inptr); 3674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 8; i++) { 3675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sign_bits = _mm_cmplt_epi16(inptr[i], zero); 3676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); 3677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); 3678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); 3679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); 
3680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 3682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Set to use the optimised transform for the column 3683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian optimised_cols = 1; 3684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 3686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Run the un-optimised row transform 3687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 8; ++i) { 3688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vpx_highbd_idct8_c(input, outptr, bd); 3689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input += 8; 3690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian outptr += 8; 3691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3693da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (optimised_cols) { 3695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8_sse2(inptr); 3696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Final round & shift and Reconstruction and Store 3698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 3699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i d[8]; 3700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 8; i++) { 3701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i] = _mm_add_epi16(inptr[i], sixteen); 
3702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); 3703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i] = _mm_srai_epi16(inptr[i], 5); 3704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); 3705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Store 3706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); 3707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 3710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Run the un-optimised column transform 3711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t temp_in[8], temp_out[8]; 3712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 8; ++i) { 3713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 8; ++j) 3714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp_in[j] = out[j * 8 + i]; 3715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vpx_highbd_idct8_c(temp_in, temp_out, bd); 3716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 8; ++j) { 3717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest[j * stride + i] = highbd_clip_pixel_add( 3718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 3719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 
3722da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 3723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, 3725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int stride, int bd) { 3726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t out[8 * 8] = { 0 }; 3727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t *outptr = out; 3728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int i, j, test; 3729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i inptr[8]; 3730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i min_input, max_input, temp1, temp2, sign_bits; 3731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 3732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_set1_epi16(0); 3733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i sixteen = _mm_set1_epi16(16); 3734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i max = _mm_set1_epi16(6201); 3735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i min = _mm_set1_epi16(-6201); 3736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int optimised_cols = 0; 3737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Load input into __m128i & pack to 16 bits 3739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 8; i++) { 3740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); 
3741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); 3742da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i] = _mm_packs_epi32(temp1, temp2); 3743da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3744da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Find the min & max for the row transform 3746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // only first 4 row has non-zero coefs 3747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(inptr[0], inptr[1]); 3748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(inptr[0], inptr[1]); 3749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 2; i < 4; i++) { 3750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(max_input, inptr[i]); 3751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(min_input, inptr[i]); 3752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_cmpgt_epi16(max_input, max); 3754da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_cmplt_epi16(min_input, min); 3755da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_or_si128(max_input, min_input); 3756da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian test = _mm_movemask_epi8(temp1); 3757da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (!test) { 3759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Do the row transform 
3760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8_sse2(inptr); 3761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Find the min & max for the column transform 3763da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // N.B. Only first 4 cols contain non-zero coeffs 3764da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(inptr[0], inptr[1]); 3765da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(inptr[0], inptr[1]); 3766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 2; i < 8; i++) { 3767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(max_input, inptr[i]); 3768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(min_input, inptr[i]); 3769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_cmpgt_epi16(max_input, max); 3771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_cmplt_epi16(min_input, min); 3772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_or_si128(max_input, min_input); 3773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian test = _mm_movemask_epi8(temp1); 3774da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3775da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (test) { 3776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Use fact only first 4 rows contain non-zero coeffs 3777da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_4X8(inptr, inptr); 3778da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; i++) { 
3779da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sign_bits = _mm_cmplt_epi16(inptr[i], zero); 3780da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); 3781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); 3782da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); 3783da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); 3784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 3786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Set to use the optimised transform for the column 3787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian optimised_cols = 1; 3788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 3790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Run the un-optimised row transform 3791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; ++i) { 3792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vpx_highbd_idct8_c(input, outptr, bd); 3793da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input += 8; 3794da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian outptr += 8; 3795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3796da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3797da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3798da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (optimised_cols) { 3799da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian idct8_sse2(inptr); 3800da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3801da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Final round & shift and Reconstruction and Store 3802da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 3803da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i d[8]; 3804da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 8; i++) { 3805da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i] = _mm_add_epi16(inptr[i], sixteen); 3806da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); 3807da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i] = _mm_srai_epi16(inptr[i], 5); 3808da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); 3809da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Store 3810da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); 3811da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3812da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3813da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 3814da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Run the un-optimised column transform 3815da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t temp_in[8], temp_out[8]; 3816da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 8; ++i) { 3817da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 8; ++j) 3818da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp_in[j] = out[j * 8 + i]; 3819da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
vpx_highbd_idct8_c(temp_in, temp_out, bd); 3820da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 8; ++j) { 3821da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest[j * stride + i] = highbd_clip_pixel_add( 3822da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 3823da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3824da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3825da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3826da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 3827da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3828da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, 3829da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int stride, int bd) { 3830da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t out[16 * 16]; 3831da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t *outptr = out; 3832da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int i, j, test; 3833da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i inptr[32]; 3834da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i min_input, max_input, temp1, temp2, sign_bits; 3835da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 3836da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_set1_epi16(0); 3837da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i rounding = _mm_set1_epi16(32); 3838da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i max = _mm_set1_epi16(3155); 3839da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh 
Venkatasubramanian const __m128i min = _mm_set1_epi16(-3155); 3840da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int optimised_cols = 0; 3841da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3842da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Load input into __m128i & pack to 16 bits 3843da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 16; i++) { 3844da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); 3845da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); 3846da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i] = _mm_packs_epi32(temp1, temp2); 3847da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); 3848da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); 3849da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i + 16] = _mm_packs_epi32(temp1, temp2); 3850da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3851da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3852da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Find the min & max for the row transform 3853da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(inptr[0], inptr[1]); 3854da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(inptr[0], inptr[1]); 3855da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 2; i < 32; i++) { 3856da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(max_input, inptr[i]); 3857da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
min_input = _mm_min_epi16(min_input, inptr[i]); 3858da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3859da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_cmpgt_epi16(max_input, max); 3860da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_cmplt_epi16(min_input, min); 3861da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_or_si128(max_input, min_input); 3862da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian test = _mm_movemask_epi8(temp1); 3863da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3864da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (!test) { 3865da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Do the row transform 3866da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct16_sse2(inptr, inptr + 16); 3867da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3868da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Find the min & max for the column transform 3869da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(inptr[0], inptr[1]); 3870da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(inptr[0], inptr[1]); 3871da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 2; i < 32; i++) { 3872da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(max_input, inptr[i]); 3873da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(min_input, inptr[i]); 3874da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3875da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_cmpgt_epi16(max_input, max); 3876da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_cmplt_epi16(min_input, min); 
3877da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_or_si128(max_input, min_input); 3878da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian test = _mm_movemask_epi8(temp1); 3879da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3880da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (test) { 3881da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_16x16(inptr, inptr + 16); 3882da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 16; i++) { 3883da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sign_bits = _mm_cmplt_epi16(inptr[i], zero); 3884da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); 3885da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); 3886da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); 3887da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); 3888da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); 3889da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); 3890da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); 3891da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); 3892da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); 3893da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3894da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 
3895da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Set to use the optimised transform for the column 3896da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian optimised_cols = 1; 3897da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3898da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 3899da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Run the un-optimised row transform 3900da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 16; ++i) { 3901da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vpx_highbd_idct16_c(input, outptr, bd); 3902da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input += 16; 3903da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian outptr += 16; 3904da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3905da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3906da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3907da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (optimised_cols) { 3908da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct16_sse2(inptr, inptr + 16); 3909da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3910da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Final round & shift and Reconstruction and Store 3911da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 3912da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i d[2]; 3913da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 16; i++) { 3914da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i ] = _mm_add_epi16(inptr[i ], rounding); 3915da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); 
3916da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); 3917da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); 3918da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i ] = _mm_srai_epi16(inptr[i ], 6); 3919da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); 3920da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); 3921da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); 3922da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Store 3923da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); 3924da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); 3925da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3926da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3927da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 3928da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Run the un-optimised column transform 3929da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t temp_in[16], temp_out[16]; 3930da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 16; ++i) { 3931da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 16; ++j) 3932da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp_in[j] = out[j * 16 + i]; 3933da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vpx_highbd_idct16_c(temp_in, temp_out, bd); 
3934da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 16; ++j) { 3935da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest[j * stride + i] = highbd_clip_pixel_add( 3936da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 3937da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3938da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3939da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3940da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 3941da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3942da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, 3943da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int stride, int bd) { 3944da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t out[16 * 16] = { 0 }; 3945da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t *outptr = out; 3946da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int i, j, test; 3947da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i inptr[32]; 3948da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i min_input, max_input, temp1, temp2, sign_bits; 3949da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 3950da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i zero = _mm_set1_epi16(0); 3951da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i rounding = _mm_set1_epi16(32); 3952da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i max = _mm_set1_epi16(3155); 3953da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i 
min = _mm_set1_epi16(-3155); 3954da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int optimised_cols = 0; 3955da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3956da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Load input into __m128i & pack to 16 bits 3957da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 16; i++) { 3958da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); 3959da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); 3960da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i] = _mm_packs_epi32(temp1, temp2); 3961da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); 3962da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); 3963da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i + 16] = _mm_packs_epi32(temp1, temp2); 3964da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3965da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3966da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Find the min & max for the row transform 3967da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Since all non-zero dct coefficients are in upper-left 4x4 area, 3968da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // we only need to consider first 4 rows here. 
3969da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(inptr[0], inptr[1]); 3970da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(inptr[0], inptr[1]); 3971da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 2; i < 4; i++) { 3972da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(max_input, inptr[i]); 3973da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(min_input, inptr[i]); 3974da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3975da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_cmpgt_epi16(max_input, max); 3976da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_cmplt_epi16(min_input, min); 3977da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_or_si128(max_input, min_input); 3978da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian test = _mm_movemask_epi8(temp1); 3979da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3980da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (!test) { 3981da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Do the row transform (N.B. This transposes inptr) 3982da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct16_sse2(inptr, inptr + 16); 3983da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3984da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Find the min & max for the column transform 3985da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // N.B. 
Only first 4 cols contain non-zero coeffs 3986da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(inptr[0], inptr[1]); 3987da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(inptr[0], inptr[1]); 3988da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 2; i < 16; i++) { 3989da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_max_epi16(max_input, inptr[i]); 3990da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_min_epi16(min_input, inptr[i]); 3991da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 3992da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian max_input = _mm_cmpgt_epi16(max_input, max); 3993da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian min_input = _mm_cmplt_epi16(min_input, min); 3994da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_or_si128(max_input, min_input); 3995da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian test = _mm_movemask_epi8(temp1); 3996da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3997da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (test) { 3998da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Use fact only first 4 rows contain non-zero coeffs 3999da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(inptr, inptr); 4000da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian array_transpose_8x8(inptr + 8, inptr + 16); 4001da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; i++) { 4002da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sign_bits = _mm_cmplt_epi16(inptr[i], zero); 4003da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); 
4004da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); 4005da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); 4006da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); 4007da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); 4008da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); 4009da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); 4010da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); 4011da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); 4012da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 4013da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 4014da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Set to use the optimised transform for the column 4015da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian optimised_cols = 1; 4016da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 4017da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 4018da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Run the un-optimised row transform 4019da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; ++i) { 4020da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vpx_highbd_idct16_c(input, outptr, bd); 4021da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian input += 16; 
4022da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian outptr += 16; 4023da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 4024da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 4025da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 4026da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (optimised_cols) { 4027da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct16_sse2(inptr, inptr + 16); 4028da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 4029da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Final round & shift and Reconstruction and Store 4030da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 4031da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i d[2]; 4032da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 16; i++) { 4033da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i ] = _mm_add_epi16(inptr[i ], rounding); 4034da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); 4035da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); 4036da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); 4037da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i ] = _mm_srai_epi16(inptr[i ], 6); 4038da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); 4039da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); 4040da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); 
4041da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Store 4042da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); 4043da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); 4044da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 4045da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 4046da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 4047da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Run the un-optimised column transform 4048da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tran_low_t temp_in[16], temp_out[16]; 4049da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 16; ++i) { 4050da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 16; ++j) 4051da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian temp_in[j] = out[j * 16 + i]; 4052da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vpx_highbd_idct16_c(temp_in, temp_out, bd); 4053da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (j = 0; j < 16; ++j) { 4054da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest[j * stride + i] = highbd_clip_pixel_add( 4055da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 4056da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 4057da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 4058da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 4059da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 4060da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#endif // CONFIG_VP9_HIGHBITDEPTH 4061