1b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* 2b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * 4b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian */ 10b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 11b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#include <immintrin.h> // AVX2 12b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#include "vp9/common/vp9_idct.h" // for cospi constants 13b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#include "vpx_ports/mem.h" 14b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 15b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { 16b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // The 2D transform is done with two passes which are actually pretty 17b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // similar. In the first one, we transform the columns and transpose 18b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // the results. 
In the second one, we transform the rows. To achieve that, 19b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // as the first pass results are transposed, we transpose the columns (that 20b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // is the transposed rows) and transpose the results (so that it goes back 21b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // in normal/row positions). 22b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int pass; 23b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Constants 24b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // When we use them, in one case, they are all the same. In all others 25b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // it's a pair of them that we need to repeat four times. This is done 26b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // by constructing the 32 bit constant corresponding to that pair. 
27b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 28b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 29b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 30b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 31b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 32b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 33b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 34b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i kOne = _mm_set1_epi16(1); 35b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in0, in1, in2, in3; 36b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Load inputs. 
37b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 38b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 39b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 40b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); 41b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 42b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // x = x << 4 43b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in0 = _mm_slli_epi16(in0, 4); 44b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in1 = _mm_slli_epi16(in1, 4); 45b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in2 = _mm_slli_epi16(in2, 4); 46b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in3 = _mm_slli_epi16(in3, 4); 47b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // if (i == 0 && input[0]) input[0] += 1; 48b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 49b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // The mask will only contain whether the first value is zero, all 50b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // other comparison will fail as something shifted by 4 (above << 4) 51b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // can never be equal to one. 
To increment in the non-zero case, we 52b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add the mask and one for the first element: 53b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // - if zero, mask = -1, v = v - 1 + 1 = v 54b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 55b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); 56b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in0 = _mm_add_epi16(in0, mask); 57b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in0 = _mm_add_epi16(in0, k__nonzero_bias_b); 58b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 59b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 60b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Do the two transform/transpose passes 61b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (pass = 0; pass < 2; ++pass) { 62b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Transform 1/2: Add/subtract 63b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i r0 = _mm_add_epi16(in0, in3); 64b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i r1 = _mm_add_epi16(in1, in2); 65b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i r2 = _mm_sub_epi16(in1, in2); 66b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i r3 = _mm_sub_epi16(in0, in3); 67b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Transform 1/2: Interleave to do the multiply by constants which gets us 68b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // into 32 bits. 
69b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 70b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 71b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 72b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 73b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 74b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 75b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 76b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 77b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 78b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 79b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 80b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 81b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 82b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 83b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine and transpose 84b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i res0 = _mm_packs_epi32(w0, w2); 
85b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i res1 = _mm_packs_epi32(w4, w6); 86b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 01 02 03 20 21 22 23 87b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 10 11 12 13 30 31 32 33 88b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); 89b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); 90b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 01 11 02 12 03 13 91b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 20 30 21 31 22 32 23 33 92b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 93b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 94b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 95b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 96b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian if (0 == pass) { 97b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Extract values in the high part for second pass as transform code 98b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // only uses the first four values. 
99b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in1 = _mm_unpackhi_epi64(in0, in0); 100b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in3 = _mm_unpackhi_epi64(in2, in2); 101b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } else { 102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Post-condition output and store it (v + 1) >> 2, taking advantage 103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // of the fact 1/3 are stored just after 0/2. 104b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i out01 = _mm_add_epi16(in0, kOne); 105b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i out23 = _mm_add_epi16(in2, kOne); 106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out01 = _mm_srai_epi16(out01, 2); 107b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out23 = _mm_srai_epi16(out23, 2); 108b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); 109b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); 110b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 111b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 112b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 113b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 114b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in, 115b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int stride) { 116b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 117b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__nonzero_bias_b = 
_mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 118b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i mask; 119b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 120b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 121b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 122b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); 123b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 124b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 125b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_slli_epi16(in[0], 4); 126b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_slli_epi16(in[1], 4); 127b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_slli_epi16(in[2], 4); 128b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_slli_epi16(in[3], 4); 129b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 130b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); 131b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_add_epi16(in[0], mask); 132b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); 133b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 134b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 135b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) { 136b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i 
kOne = _mm_set1_epi16(1); 137b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); 138b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); 139b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i out01 = _mm_add_epi16(in01, kOne); 140b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i out23 = _mm_add_epi16(in23, kOne); 141b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out01 = _mm_srai_epi16(out01, 2); 142b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out23 = _mm_srai_epi16(out23, 2); 143b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i *)(output + 0 * 8), out01); 144b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i *)(output + 1 * 8), out23); 145b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 146b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 147b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void transpose_4x4_avx2(__m128i *res) { 148b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine and transpose 149b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 01 02 03 20 21 22 23 150b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 10 11 12 13 30 31 32 33 151b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 152b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); 153b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 154b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 01 11 02 12 03 13 155b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 20 
30 21 31 22 32 23 33 156b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); 157b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); 158b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 159b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 20 30 01 11 21 31 160b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 02 12 22 32 03 13 23 33 161b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // only use the first 4 16-bit integers 162b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[1] = _mm_unpackhi_epi64(res[0], res[0]); 163b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[3] = _mm_unpackhi_epi64(res[2], res[2]); 164b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 165b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 166b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid fdct4_avx2(__m128i *in) { 167b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 168b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 169b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 170b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 171b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 172b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 173b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i u[4], v[4]; 
174b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0]=_mm_unpacklo_epi16(in[0], in[1]); 175b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1]=_mm_unpacklo_epi16(in[3], in[2]); 176b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 177b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_add_epi16(u[0], u[1]); 178b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_sub_epi16(u[0], u[1]); 179b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 180b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 181b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 182b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 183b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 184b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 185b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 186b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 187b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 188b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 189b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 190b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 191b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 
192b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 193b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 194b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_packs_epi32(u[0], u[1]); 195b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_packs_epi32(u[2], u[3]); 196b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian transpose_4x4_avx2(in); 197b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 198b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 199b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid fadst4_avx2(__m128i *in) { 200b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); 201b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); 202b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); 203b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); 204b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); 205b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i kZero = _mm_set1_epi16(0); 206b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 207b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i u[8], v[8]; 208b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in7 = _mm_add_epi16(in[0], in[1]); 209b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
210b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(in[0], in[1]); 211b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_unpacklo_epi16(in[2], in[3]); 212b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(in7, kZero); 213b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_unpacklo_epi16(in[2], kZero); 214b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(in[3], kZero); 215b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 216b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 217b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 218b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 219b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 220b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 221b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 222b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); 223b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 224b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], v[1]); 225b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_sub_epi32(v[2], v[6]); 226b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[3], v[4]); 227b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_sub_epi32(u[2], u[0]); 
228b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_slli_epi32(v[5], 2); 229b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_sub_epi32(u[4], v[5]); 230b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_add_epi32(u[3], u[5]); 231b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 232b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 233b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 234b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 235b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 236b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 237b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 238b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 239b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 240b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 241b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 242b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_packs_epi32(u[0], u[2]); 243b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_packs_epi32(u[1], u[3]); 244b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian transpose_4x4_avx2(in); 245b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 246b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 247b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid 
vp9_fht4x4_avx2(const int16_t *input, int16_t *output, 248b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int stride, int tx_type) { 249b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in[4]; 250b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 251b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian switch (tx_type) { 252b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian case DCT_DCT: 253b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian vp9_fdct4x4_avx2(input, output, stride); 254b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian break; 255b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian case ADST_DCT: 256b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian load_buffer_4x4_avx2(input, in, stride); 257b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fadst4_avx2(in); 258b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fdct4_avx2(in); 259b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian write_buffer_4x4_avx2(output, in); 260b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian break; 261b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian case DCT_ADST: 262b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian load_buffer_4x4_avx2(input, in, stride); 263b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fdct4_avx2(in); 264b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fadst4_avx2(in); 265b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian write_buffer_4x4_avx2(output, in); 266b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian break; 267b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian case ADST_ADST: 268b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian load_buffer_4x4_avx2(input, in, stride); 
269b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fadst4_avx2(in); 270b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fadst4_avx2(in); 271b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian write_buffer_4x4_avx2(output, in); 272b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian break; 273b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian default: 274b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian assert(0); 275b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian break; 276b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 277b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}

// Multiply-accumulate the interleaved 16-bit pairs held in |lo| and |hi| with
// the repeated constant pair |k|, apply dct_const_round_shift (add
// DCT_CONST_ROUNDING, arithmetic shift right by DCT_CONST_BITS) to the 32-bit
// sums and pack both halves back into eight 16-bit lanes with signed
// saturation.
static INLINE __m128i fdct8x8_madd_round_pack_avx2(__m128i lo, __m128i hi,
                                                   __m128i k) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i sum_lo = _mm_add_epi32(_mm_madd_epi16(lo, k), rounding);
  const __m128i sum_hi = _mm_add_epi32(_mm_madd_epi16(hi, k), rounding);
  return _mm_packs_epi32(_mm_srai_epi32(sum_lo, DCT_CONST_BITS),
                         _mm_srai_epi32(sum_hi, DCT_CONST_BITS));
}

// Forward 8x8 transform.
//
// input:  eight rows of eight int16_t values, row i starting at
//         input + i * stride. Rows are read with the aligned load intrinsic.
// output: 64 int16_t values written as eight consecutive rows of eight
//         (row i at output + i * 8) with the aligned store intrinsic.
//
// The rows are pre-scaled by 4, run through two identical butterfly passes
// (columns first, then rows; each pass ends with an 8x8 transpose so the same
// column code serves both), and finally halved with rounding toward zero.
void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
  // Each constant is a pair of 16-bit cosine factors repeated four times so
  // that one _mm_madd_epi16 on interleaved data evaluates a*c0 + b*c1 in
  // every 32-bit lane.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  __m128i in[8];
  int i;
  int pass;

  // Load the eight rows and pre-condition them (shift left by two).
  for (i = 0; i < 8; ++i) {
    const __m128i row = _mm_load_si128((const __m128i *)(input + i * stride));
    in[i] = _mm_slli_epi16(row, 2);
  }

  // Pass 0 transforms the columns, pass 1 the rows (seen as columns thanks to
  // the transpose that closes pass 0). The transpose that closes pass 1 puts
  // the data back in row order.
  for (pass = 0; pass < 2; ++pass) {
    // Results of this pass before the transpose.
    __m128i res[8];
    // Stage 1: mirror sums (s0..s3) and mirror differences (s4..s7).
    const __m128i s0 = _mm_add_epi16(in[0], in[7]);
    const __m128i s1 = _mm_add_epi16(in[1], in[6]);
    const __m128i s2 = _mm_add_epi16(in[2], in[5]);
    const __m128i s3 = _mm_add_epi16(in[3], in[4]);
    const __m128i s4 = _mm_sub_epi16(in[3], in[4]);
    const __m128i s5 = _mm_sub_epi16(in[2], in[5]);
    const __m128i s6 = _mm_sub_epi16(in[1], in[6]);
    const __m128i s7 = _mm_sub_epi16(in[0], in[7]);

    // Even outputs (0, 4, 2, 6) come from the sums.
    {
      const __m128i e0 = _mm_add_epi16(s0, s3);
      const __m128i e1 = _mm_add_epi16(s1, s2);
      const __m128i e2 = _mm_sub_epi16(s1, s2);
      const __m128i e3 = _mm_sub_epi16(s0, s3);
      // Interleave so the constant multiplies can accumulate into 32 bits.
      const __m128i e01_lo = _mm_unpacklo_epi16(e0, e1);
      const __m128i e01_hi = _mm_unpackhi_epi16(e0, e1);
      const __m128i e23_lo = _mm_unpacklo_epi16(e2, e3);
      const __m128i e23_hi = _mm_unpackhi_epi16(e2, e3);
      res[0] = fdct8x8_madd_round_pack_avx2(e01_lo, e01_hi, k__cospi_p16_p16);
      res[4] = fdct8x8_madd_round_pack_avx2(e01_lo, e01_hi, k__cospi_p16_m16);
      res[2] = fdct8x8_madd_round_pack_avx2(e23_lo, e23_hi, k__cospi_p24_p08);
      res[6] = fdct8x8_madd_round_pack_avx2(e23_lo, e23_hi, k__cospi_m08_p24);
    }

    // Odd outputs (1, 7, 5, 3) come from the differences.
    {
      // Rotate the two middle differences by cospi_16_64 first.
      const __m128i d_lo = _mm_unpacklo_epi16(s6, s5);
      const __m128i d_hi = _mm_unpackhi_epi16(s6, s5);
      const __m128i rot_diff =
          fdct8x8_madd_round_pack_avx2(d_lo, d_hi, k__cospi_p16_m16);
      const __m128i rot_sum =
          fdct8x8_madd_round_pack_avx2(d_lo, d_hi, k__cospi_p16_p16);
      // Add/subtract against the outer differences.
      const __m128i o0 = _mm_add_epi16(s4, rot_diff);
      const __m128i o1 = _mm_sub_epi16(s4, rot_diff);
      const __m128i o2 = _mm_sub_epi16(s7, rot_sum);
      const __m128i o3 = _mm_add_epi16(s7, rot_sum);
      // Interleave so the constant multiplies can accumulate into 32 bits.
      const __m128i o03_lo = _mm_unpacklo_epi16(o0, o3);
      const __m128i o03_hi = _mm_unpackhi_epi16(o0, o3);
      const __m128i o12_lo = _mm_unpacklo_epi16(o1, o2);
      const __m128i o12_hi = _mm_unpackhi_epi16(o1, o2);
      res[1] = fdct8x8_madd_round_pack_avx2(o03_lo, o03_hi, k__cospi_p28_p04);
      res[7] = fdct8x8_madd_round_pack_avx2(o03_lo, o03_hi, k__cospi_m04_p28);
      res[5] = fdct8x8_madd_round_pack_avx2(o12_lo, o12_hi, k__cospi_p12_p20);
      res[3] = fdct8x8_madd_round_pack_avx2(o12_lo, o12_hi, k__cospi_m20_p12);
    }

    // Transpose the 8x8 block held in res[] back into in[].
    // Element "rc" below denotes row r, column c of res[].
    {
      // 16-bit interleave of neighbouring rows.
      // a0: 00 10 01 11 02 12 03 13
      // a1: 20 30 21 31 22 32 23 33
      // a2: 04 14 05 15 06 16 07 17
      // a3: 24 34 25 35 26 36 27 37
      // a4: 40 50 41 51 42 52 43 53
      // a5: 60 70 61 71 62 72 63 73
      // a6: 44 54 45 55 46 56 47 57
      // a7: 64 74 65 75 66 76 67 77
      const __m128i a0 = _mm_unpacklo_epi16(res[0], res[1]);
      const __m128i a1 = _mm_unpacklo_epi16(res[2], res[3]);
      const __m128i a2 = _mm_unpackhi_epi16(res[0], res[1]);
      const __m128i a3 = _mm_unpackhi_epi16(res[2], res[3]);
      const __m128i a4 = _mm_unpacklo_epi16(res[4], res[5]);
      const __m128i a5 = _mm_unpacklo_epi16(res[6], res[7]);
      const __m128i a6 = _mm_unpackhi_epi16(res[4], res[5]);
      const __m128i a7 = _mm_unpackhi_epi16(res[6], res[7]);
      // 32-bit interleave.
      // b0: 00 10 20 30 01 11 21 31
      // b1: 04 14 24 34 05 15 25 35
      // b2: 02 12 22 32 03 13 23 33
      // b3: 06 16 26 36 07 17 27 37
      // b4: 40 50 60 70 41 51 61 71
      // b5: 44 54 64 74 45 55 65 75
      // b6: 42 52 62 72 43 53 63 73
      // b7: 46 56 66 76 47 57 67 77
      const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
      const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
      const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
      const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
      const __m128i b4 = _mm_unpacklo_epi32(a4, a5);
      const __m128i b5 = _mm_unpacklo_epi32(a6, a7);
      const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
      const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
      // 64-bit interleave completes the transpose.
      // in[0]: 00 10 20 30 40 50 60 70
      // in[1]: 01 11 21 31 41 51 61 71
      // in[2]: 02 12 22 32 42 52 62 72
      // in[3]: 03 13 23 33 43 53 63 73
      // in[4]: 04 14 24 34 44 54 64 74
      // in[5]: 05 15 25 35 45 55 65 75
      // in[6]: 06 16 26 36 46 56 66 76
      // in[7]: 07 17 27 37 47 57 67 77
      in[0] = _mm_unpacklo_epi64(b0, b4);
      in[1] = _mm_unpackhi_epi64(b0, b4);
      in[2] = _mm_unpacklo_epi64(b2, b6);
      in[3] = _mm_unpackhi_epi64(b2, b6);
      in[4] = _mm_unpacklo_epi64(b1, b5);
      in[5] = _mm_unpackhi_epi64(b1, b5);
      in[6] = _mm_unpacklo_epi64(b3, b7);
      in[7] = _mm_unpackhi_epi64(b3, b7);
    }
  }

  // Post-condition (signed division by two done with shifts:
  // n / 2 = (n - (n >> 15)) >> 1) and store the rows contiguously.
  for (i = 0; i < 8; ++i) {
    const __m128i sign = _mm_srai_epi16(in[i], 15);
    const __m128i halved = _mm_srai_epi16(_mm_sub_epi16(in[i], sign), 1);
    _mm_store_si128((__m128i *)(output + i * 8), halved);
  }
}

// Load an 8x8 block of int16_t (row i at input + i * stride) into in[0..7]
// and scale every value by 4 (shift left by two).
static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in,
                                        int stride) {
  int row;

  for (row = 0; row < 8; ++row) {
    in[row] = _mm_load_si128((const __m128i *)(input + row * stride));
  }
  for (row = 0; row < 8; ++row) {
    in[row] = _mm_slli_epi16(in[row], 2);
  }
}

// Right shift the eight rows in res[] by |bit| with rounding: when bit >= 2 a
// bias of 1 << (bit - 2) is added, negative lanes get an extra +1 (their sign
// mask, -1, is subtracted), and the sum is shifted arithmetically by |bit|.
static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) {
  const int bias_shift = bit - 2;
  const int add_bias = bias_shift >= 0;
  const __m128i bias =
      add_bias ? _mm_slli_epi16(_mm_set1_epi16(1), bias_shift)
               : _mm_setzero_si128();
  int row;

  for (row = 0; row < 8; ++row) {
    // All ones in negative lanes, zero elsewhere; taken before any bias.
    const __m128i sign = _mm_srai_epi16(res[row], 15);
    __m128i v = res[row];
    if (add_bias) {
      v = _mm_add_epi16(v, bias);
    }
    v = _mm_sub_epi16(v, sign);
    res[row] = _mm_srai_epi16(v, bit);
  }
}

// Store the eight rows in res[] to output, row i at output + i * stride.
static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res, int stride) {
  int row;

  for (row = 0; row < 8; ++row) {
    _mm_store_si128((__m128i *)(output + row * stride), res[row]);
  }
}

613b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian// perform in-place transpose 614b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) { 615b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); 616b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
617b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); 618b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); 619b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); 620b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); 621b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); 622b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); 623b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 01 11 02 12 03 13 624b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 20 30 21 31 22 32 23 33 625b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 04 14 05 15 06 16 07 17 626b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 24 34 25 35 26 36 27 37 627b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 40 50 41 51 42 52 43 53 628b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 60 70 61 71 62 72 63 73 629b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 44 54 45 55 46 56 47 57 630b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 64 74 65 75 66 76 67 77 631b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 632b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); 633b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 634b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_3 
= _mm_unpackhi_epi32(tr0_4, tr0_5); 635b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); 636b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 637b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); 638b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 639b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 20 30 01 11 21 31 640b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 40 50 60 70 41 51 61 71 641b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 02 12 22 32 03 13 23 33 642b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 42 52 62 72 43 53 63 73 643b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 04 14 24 34 05 15 25 35 644b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 44 54 64 74 45 55 65 75 645b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 06 16 26 36 07 17 27 37 646b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 46 56 66 76 47 57 67 77 647b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); 648b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); 649b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); 650b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); 651b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); 652b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); 
653b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); 654b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); 655b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 20 30 40 50 60 70 656b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 01 11 21 31 41 51 61 71 657b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 02 12 22 32 42 52 62 72 658b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 03 13 23 33 43 53 63 73 659b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 04 14 24 34 44 54 64 74 660b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 05 15 25 35 45 55 65 75 661b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 06 16 26 36 46 56 66 76 662b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 07 17 27 37 47 57 67 77 663b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 664b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 665b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid fdct8_avx2(__m128i *in) { 666b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // constants 667b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 668b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 669b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 670b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 671b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p28_p04 = 
pair_set_epi16(cospi_28_64, cospi_4_64); 672b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 673b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 674b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 675b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 676b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i u0, u1, u2, u3, u4, u5, u6, u7; 677b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i v0, v1, v2, v3, v4, v5, v6, v7; 678b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i s0, s1, s2, s3, s4, s5, s6, s7; 679b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 680b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 1 681b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s0 = _mm_add_epi16(in[0], in[7]); 682b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s1 = _mm_add_epi16(in[1], in[6]); 683b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s2 = _mm_add_epi16(in[2], in[5]); 684b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s3 = _mm_add_epi16(in[3], in[4]); 685b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s4 = _mm_sub_epi16(in[3], in[4]); 686b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s5 = _mm_sub_epi16(in[2], in[5]); 687b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s6 = _mm_sub_epi16(in[1], in[6]); 688b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s7 = _mm_sub_epi16(in[0], in[7]); 689b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian 690b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_add_epi16(s0, s3); 691b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_add_epi16(s1, s2); 692b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_sub_epi16(s1, s2); 693b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u3 = _mm_sub_epi16(s0, s3); 694b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // interleave and perform butterfly multiplication/addition 695b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v0 = _mm_unpacklo_epi16(u0, u1); 696b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v1 = _mm_unpackhi_epi16(u0, u1); 697b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v2 = _mm_unpacklo_epi16(u2, u3); 698b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v3 = _mm_unpackhi_epi16(u2, u3); 699b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 700b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); 701b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); 702b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); 703b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); 704b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); 705b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); 706b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); 707b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); 708b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
709b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shift and rounding 710b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 711b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 712b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 713b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 714b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 715b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 716b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 717b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 718b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 719b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 720b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 721b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 722b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 723b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 724b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 725b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 726b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 
727b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 728b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_packs_epi32(u0, u1); 729b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_packs_epi32(u4, u5); 730b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_packs_epi32(u2, u3); 731b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_packs_epi32(u6, u7); 732b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 733b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 2 734b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // interleave and perform butterfly multiplication/addition 735b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_unpacklo_epi16(s6, s5); 736b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_unpackhi_epi16(s6, s5); 737b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); 738b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); 739b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); 740b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); 741b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 742b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shift and rounding 743b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 744b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 745b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 746b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 747b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 748b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 749b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 750b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 751b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 752b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 753b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_packs_epi32(v0, v1); 754b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_packs_epi32(v2, v3); 755b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 756b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 3 757b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s0 = _mm_add_epi16(s4, u0); 758b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s1 = _mm_sub_epi16(s4, u0); 759b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s2 = _mm_sub_epi16(s7, u1); 760b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s3 = _mm_add_epi16(s7, u1); 761b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 762b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 4 763b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_unpacklo_epi16(s0, s3); 764b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_unpackhi_epi16(s0, s3); 765b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_unpacklo_epi16(s1, s2); 766b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u3 = _mm_unpackhi_epi16(s1, s2); 
767b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 768b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); 769b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); 770b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); 771b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); 772b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); 773b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); 774b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); 775b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); 776b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 777b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shift and rounding 778b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 779b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 780b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 781b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 782b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 783b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 784b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 
785b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 786b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 787b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 788b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 789b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 790b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 791b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 792b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 793b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 794b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 795b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 796b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_packs_epi32(v0, v1); 797b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_packs_epi32(v4, v5); 798b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_packs_epi32(v2, v3); 799b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_packs_epi32(v6, v7); 800b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 801b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // transpose 802b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8_avx2(in, in); 803b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 804b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
805b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid fadst8_avx2(__m128i *in) { 806b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Constants 807b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 808b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 809b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 810b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 811b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 812b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 813b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 814b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 815b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 816b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 817b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 818b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 819b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const 
__m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 820b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__const_0 = _mm_set1_epi16(0); 821b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 822b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 823b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; 824b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 825b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; 826b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i s0, s1, s2, s3, s4, s5, s6, s7; 827b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in0, in1, in2, in3, in4, in5, in6, in7; 828b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 829b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // properly aligned for butterfly input 830b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in0 = in[7]; 831b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in1 = in[0]; 832b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in2 = in[5]; 833b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in3 = in[2]; 834b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in4 = in[3]; 835b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in5 = in[4]; 836b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in6 = in[1]; 837b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in7 = in[6]; 838b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
839b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // column transformation 840b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 1 841b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // interleave and multiply/add into 32-bit integer 842b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s0 = _mm_unpacklo_epi16(in0, in1); 843b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s1 = _mm_unpackhi_epi16(in0, in1); 844b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s2 = _mm_unpacklo_epi16(in2, in3); 845b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s3 = _mm_unpackhi_epi16(in2, in3); 846b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s4 = _mm_unpacklo_epi16(in4, in5); 847b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s5 = _mm_unpackhi_epi16(in4, in5); 848b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s6 = _mm_unpacklo_epi16(in6, in7); 849b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s7 = _mm_unpackhi_epi16(in6, in7); 850b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 851b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); 852b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); 853b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); 854b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); 855b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); 856b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); 857b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); 
858b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); 859b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); 860b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); 861b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); 862b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); 863b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); 864b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); 865b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); 866b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); 867b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 868b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // addition 869b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w0 = _mm_add_epi32(u0, u8); 870b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w1 = _mm_add_epi32(u1, u9); 871b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w2 = _mm_add_epi32(u2, u10); 872b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w3 = _mm_add_epi32(u3, u11); 873b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w4 = _mm_add_epi32(u4, u12); 874b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w5 = _mm_add_epi32(u5, u13); 875b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w6 = _mm_add_epi32(u6, u14); 876b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w7 = _mm_add_epi32(u7, u15); 
877b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w8 = _mm_sub_epi32(u0, u8); 878b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w9 = _mm_sub_epi32(u1, u9); 879b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w10 = _mm_sub_epi32(u2, u10); 880b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w11 = _mm_sub_epi32(u3, u11); 881b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w12 = _mm_sub_epi32(u4, u12); 882b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w13 = _mm_sub_epi32(u5, u13); 883b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w14 = _mm_sub_epi32(u6, u14); 884b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w15 = _mm_sub_epi32(u7, u15); 885b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 886b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shift and rounding 887b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 888b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 889b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 890b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 891b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 892b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 893b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 894b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 895b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v8 = _mm_add_epi32(w8, 
k__DCT_CONST_ROUNDING); 896b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); 897b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); 898b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); 899b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); 900b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); 901b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); 902b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); 903b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 904b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 905b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 906b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 907b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 908b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 909b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 910b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 911b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 912b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); 913b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u9 = 
_mm_srai_epi32(v9, DCT_CONST_BITS); 914b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); 915b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); 916b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); 917b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); 918b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); 919b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); 920b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 921b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // back to 16-bit and pack 8 integers into __m128i 922b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_packs_epi32(u0, u1); 923b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_packs_epi32(u2, u3); 924b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_packs_epi32(u4, u5); 925b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_packs_epi32(u6, u7); 926b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_packs_epi32(u8, u9); 927b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_packs_epi32(u10, u11); 928b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_packs_epi32(u12, u13); 929b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_packs_epi32(u14, u15); 930b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 931b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 2 932b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s0 = _mm_add_epi16(in[0], in[2]); 
933b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s1 = _mm_add_epi16(in[1], in[3]); 934b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s2 = _mm_sub_epi16(in[0], in[2]); 935b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s3 = _mm_sub_epi16(in[1], in[3]); 936b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_unpacklo_epi16(in[4], in[5]); 937b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_unpackhi_epi16(in[4], in[5]); 938b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_unpacklo_epi16(in[6], in[7]); 939b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u3 = _mm_unpackhi_epi16(in[6], in[7]); 940b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 941b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); 942b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); 943b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); 944b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); 945b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); 946b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); 947b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); 948b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); 949b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 950b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w0 = _mm_add_epi32(v0, v4); 951b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w1 = _mm_add_epi32(v1, v5); 
952b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w2 = _mm_add_epi32(v2, v6); 953b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w3 = _mm_add_epi32(v3, v7); 954b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w4 = _mm_sub_epi32(v0, v4); 955b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w5 = _mm_sub_epi32(v1, v5); 956b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w6 = _mm_sub_epi32(v2, v6); 957b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian w7 = _mm_sub_epi32(v3, v7); 958b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 959b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 960b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 961b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 962b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 963b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 964b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 965b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 966b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 967b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 968b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 969b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 970b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 
971b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 972b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 973b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 974b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 975b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 976b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 977b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // back to 16-bit intergers 978b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s4 = _mm_packs_epi32(u0, u1); 979b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s5 = _mm_packs_epi32(u2, u3); 980b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s6 = _mm_packs_epi32(u4, u5); 981b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s7 = _mm_packs_epi32(u6, u7); 982b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 983b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 3 984b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_unpacklo_epi16(s2, s3); 985b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_unpackhi_epi16(s2, s3); 986b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_unpacklo_epi16(s6, s7); 987b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u3 = _mm_unpackhi_epi16(s6, s7); 988b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 989b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); 990b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); 
991b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); 992b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); 993b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); 994b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); 995b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); 996b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); 997b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 998b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 999b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 1000b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 1001b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 1002b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 1003b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 1004b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 1005b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 1006b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1007b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 1008b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 
1009b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 1010b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 1011b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 1012b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 1013b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 1014b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 1015b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1016b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s2 = _mm_packs_epi32(v0, v1); 1017b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s3 = _mm_packs_epi32(v2, v3); 1018b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s6 = _mm_packs_epi32(v4, v5); 1019b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s7 = _mm_packs_epi32(v6, v7); 1020b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1021b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // FIXME(jingning): do subtract using bit inversion? 
1022b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = s0; 1023b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_sub_epi16(k__const_0, s4); 1024b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = s6; 1025b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_sub_epi16(k__const_0, s2); 1026b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = s3; 1027b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_sub_epi16(k__const_0, s7); 1028b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = s5; 1029b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_sub_epi16(k__const_0, s1); 1030b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1031b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // transpose 1032b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8_avx2(in, in); 1033b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 1034b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian

// Runs the 8x8 forward transform variant selected by tx_type.
//
//   input   - source coefficients, read with a row pitch of `stride`
//   output  - destination buffer for the transformed block
//   stride  - row pitch of `input`, in int16_t elements
//   tx_type - one of DCT_DCT, ADST_DCT, DCT_ADST, ADST_ADST
//
// DCT_DCT is forwarded unchanged to vp9_fdct8x8_avx2().  The three ADST
// variants load the block into eight __m128i rows, apply two 1-D passes
// in the order given by tx_type, shift with right_shift_8x8_avx2(in, 1)
// and store the result.  Any other tx_type hits assert(0); when asserts
// are compiled out the function returns without touching `output`.
void vp9_fht8x8_avx2(const int16_t *input, int16_t *output,
                     int stride, int tx_type) {
  __m128i in[8];

  // The pure DCT case has a dedicated routine that does everything itself.
  if (tx_type == DCT_DCT) {
    vp9_fdct8x8_avx2(input, output, stride);
    return;
  }

  // Reject anything that is not one of the three ADST-based types before
  // any data is loaded or written.
  if (tx_type != ADST_DCT && tx_type != DCT_ADST && tx_type != ADST_ADST) {
    assert(0);
    return;
  }

  load_buffer_8x8_avx2(input, in, stride);

  // First 1-D pass: only DCT_ADST starts with the DCT.
  if (tx_type == DCT_ADST) {
    fdct8_avx2(in);
  } else {
    fadst8_avx2(in);
  }

  // Second 1-D pass: only ADST_DCT finishes with the DCT.
  if (tx_type == ADST_DCT) {
    fdct8_avx2(in);
  } else {
    fadst8_avx2(in);
  }

  right_shift_8x8_avx2(in, 1);
  // NOTE(review): the literal 8 is presumably the output row pitch;
  // confirm against write_buffer_8x8_avx2().
  write_buffer_8x8_avx2(output, in, 8);
}

1069b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1070b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { 1071b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // The 2D transform is done with two passes which are actually pretty 1072b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // similar. In the first one, we transform the columns and transpose 1073b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // the results. In the second one, we transform the rows. To achieve that, 1074b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // as the first pass results are transposed, we transpose the columns (that 1075b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // is the transposed rows) and transpose the results (so that it goes back 1076b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // in normal/row positions). 1077b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int pass; 1078b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // We need an intermediate buffer between passes. 
1079b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); 1080b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const int16_t *in = input; 1081b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int16_t *out = intermediate; 1082b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Constants 1083b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // When we use them, in one case, they are all the same. In all others 1084b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // it's a pair of them that we need to repeat four times. This is done 1085b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // by constructing the 32 bit constant corresponding to that pair. 1086b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1087b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1088b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 1089b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1090b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1091b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 1092b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 1093b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 
1094b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 1095b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); 1096b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); 1097b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 1098b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 1099b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); 1100b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); 1101b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 1102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 1103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1104b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i kOne = _mm_set1_epi16(1); 1105b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Do the two transform/transpose passes 1106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (pass = 0; pass < 2; ++pass) { 1107b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // We process eight columns (transposed rows in second pass) at a time. 
1108b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int column_start; 1109b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (column_start = 0; column_start < 16; column_start += 8) { 1110b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in00, in01, in02, in03, in04, in05, in06, in07; 1111b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in08, in09, in10, in11, in12, in13, in14, in15; 1112b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i input0, input1, input2, input3, input4, input5, input6, input7; 1113b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i step1_0, step1_1, step1_2, step1_3; 1114b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i step1_4, step1_5, step1_6, step1_7; 1115b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; 1116b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i step3_0, step3_1, step3_2, step3_3; 1117b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i step3_4, step3_5, step3_6, step3_7; 1118b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i res00, res01, res02, res03, res04, res05, res06, res07; 1119b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i res08, res09, res10, res11, res12, res13, res14, res15; 1120b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Load and pre-condition input. 
1121b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian if (0 == pass) { 1122b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); 1123b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); 1124b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); 1125b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); 1126b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); 1127b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); 1128b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); 1129b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); 1130b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); 1131b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); 1132b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); 1133b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); 1134b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); 1135b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); 1136b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in14 = 
_mm_load_si128((const __m128i *)(in + 14 * stride)); 1137b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); 1138b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // x = x << 2 1139b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in00 = _mm_slli_epi16(in00, 2); 1140b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in01 = _mm_slli_epi16(in01, 2); 1141b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in02 = _mm_slli_epi16(in02, 2); 1142b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in03 = _mm_slli_epi16(in03, 2); 1143b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in04 = _mm_slli_epi16(in04, 2); 1144b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in05 = _mm_slli_epi16(in05, 2); 1145b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in06 = _mm_slli_epi16(in06, 2); 1146b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in07 = _mm_slli_epi16(in07, 2); 1147b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in08 = _mm_slli_epi16(in08, 2); 1148b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in09 = _mm_slli_epi16(in09, 2); 1149b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in10 = _mm_slli_epi16(in10, 2); 1150b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in11 = _mm_slli_epi16(in11, 2); 1151b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in12 = _mm_slli_epi16(in12, 2); 1152b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in13 = _mm_slli_epi16(in13, 2); 1153b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in14 = _mm_slli_epi16(in14, 2); 1154b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in15 = _mm_slli_epi16(in15, 2); 1155b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } else { 
1156b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); 1157b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); 1158b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); 1159b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); 1160b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); 1161b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); 1162b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); 1163b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); 1164b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); 1165b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); 1166b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); 1167b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); 1168b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); 1169b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); 1170b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); 1171b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in15 = _mm_load_si128((const 
__m128i *)(in + 15 * 16)); 1172b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // x = (x + 1) >> 2 1173b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in00 = _mm_add_epi16(in00, kOne); 1174b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in01 = _mm_add_epi16(in01, kOne); 1175b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in02 = _mm_add_epi16(in02, kOne); 1176b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in03 = _mm_add_epi16(in03, kOne); 1177b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in04 = _mm_add_epi16(in04, kOne); 1178b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in05 = _mm_add_epi16(in05, kOne); 1179b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in06 = _mm_add_epi16(in06, kOne); 1180b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in07 = _mm_add_epi16(in07, kOne); 1181b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in08 = _mm_add_epi16(in08, kOne); 1182b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in09 = _mm_add_epi16(in09, kOne); 1183b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in10 = _mm_add_epi16(in10, kOne); 1184b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in11 = _mm_add_epi16(in11, kOne); 1185b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in12 = _mm_add_epi16(in12, kOne); 1186b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in13 = _mm_add_epi16(in13, kOne); 1187b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in14 = _mm_add_epi16(in14, kOne); 1188b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in15 = _mm_add_epi16(in15, kOne); 1189b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in00 = _mm_srai_epi16(in00, 2); 1190b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in01 = _mm_srai_epi16(in01, 
2); 1191b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in02 = _mm_srai_epi16(in02, 2); 1192b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in03 = _mm_srai_epi16(in03, 2); 1193b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in04 = _mm_srai_epi16(in04, 2); 1194b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in05 = _mm_srai_epi16(in05, 2); 1195b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in06 = _mm_srai_epi16(in06, 2); 1196b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in07 = _mm_srai_epi16(in07, 2); 1197b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in08 = _mm_srai_epi16(in08, 2); 1198b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in09 = _mm_srai_epi16(in09, 2); 1199b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in10 = _mm_srai_epi16(in10, 2); 1200b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in11 = _mm_srai_epi16(in11, 2); 1201b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in12 = _mm_srai_epi16(in12, 2); 1202b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in13 = _mm_srai_epi16(in13, 2); 1203b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in14 = _mm_srai_epi16(in14, 2); 1204b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in15 = _mm_srai_epi16(in15, 2); 1205b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1206b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in += 8; 1207b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Calculate input for the first 8 results. 
1208b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1209b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian input0 = _mm_add_epi16(in00, in15); 1210b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian input1 = _mm_add_epi16(in01, in14); 1211b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian input2 = _mm_add_epi16(in02, in13); 1212b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian input3 = _mm_add_epi16(in03, in12); 1213b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian input4 = _mm_add_epi16(in04, in11); 1214b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian input5 = _mm_add_epi16(in05, in10); 1215b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian input6 = _mm_add_epi16(in06, in09); 1216b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian input7 = _mm_add_epi16(in07, in08); 1217b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1218b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Calculate input for the next 8 results. 
1219b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1220b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_0 = _mm_sub_epi16(in07, in08); 1221b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_1 = _mm_sub_epi16(in06, in09); 1222b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_2 = _mm_sub_epi16(in05, in10); 1223b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_3 = _mm_sub_epi16(in04, in11); 1224b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_4 = _mm_sub_epi16(in03, in12); 1225b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_5 = _mm_sub_epi16(in02, in13); 1226b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_6 = _mm_sub_epi16(in01, in14); 1227b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_7 = _mm_sub_epi16(in00, in15); 1228b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1229b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Work on the first eight values; fdct8(input, even_results); 1230b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1231b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Add/subtract 1232b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i q0 = _mm_add_epi16(input0, input7); 1233b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i q1 = _mm_add_epi16(input1, input6); 1234b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i q2 = _mm_add_epi16(input2, input5); 1235b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i q3 = _mm_add_epi16(input3, input4); 1236b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i q4 = _mm_sub_epi16(input3, input4); 1237b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i q5 = 
_mm_sub_epi16(input2, input5); 1238b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i q6 = _mm_sub_epi16(input1, input6); 1239b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i q7 = _mm_sub_epi16(input0, input7); 1240b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Work on first four results 1241b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1242b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Add/subtract 1243b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i r0 = _mm_add_epi16(q0, q3); 1244b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i r1 = _mm_add_epi16(q1, q2); 1245b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i r2 = _mm_sub_epi16(q1, q2); 1246b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i r3 = _mm_sub_epi16(q0, q3); 1247b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Interleave to do the multiply by constants which gets us 1248b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // into 32 bits. 
1249b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 1250b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(r0, r1); 1251b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 1252b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(r2, r3); 1253b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 1254b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 1255b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 1256b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); 1257b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 1258b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); 1259b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 1260b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); 1261b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // dct_const_round_shift 1262b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1263b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1264b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 
1265b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1266b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 1267b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 1268b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 1269b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 1270b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1271b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1272b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1273b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1274b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 1275b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 1276b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 1277b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 1278b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine 1279b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res00 = _mm_packs_epi32(w0, w1); 1280b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res08 = _mm_packs_epi32(w2, w3); 
1281b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res04 = _mm_packs_epi32(w4, w5); 1282b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res12 = _mm_packs_epi32(w6, w7); 1283b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1284b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Work on next four results 1285b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1286b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Interleave to do the multiply by constants which gets us 1287b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // into 32 bits. 1288b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i d0 = _mm_unpacklo_epi16(q6, q5); 1289b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i d1 = _mm_unpackhi_epi16(q6, q5); 1290b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); 1291b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); 1292b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); 1293b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); 1294b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // dct_const_round_shift 1295b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); 1296b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); 1297b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); 1298b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i 
f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); 1299b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); 1300b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); 1301b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); 1302b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); 1303b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine 1304b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i r0 = _mm_packs_epi32(s0, s1); 1305b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i r1 = _mm_packs_epi32(s2, s3); 1306b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Add/subtract 1307b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i x0 = _mm_add_epi16(q4, r0); 1308b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i x1 = _mm_sub_epi16(q4, r0); 1309b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i x2 = _mm_sub_epi16(q7, r1); 1310b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i x3 = _mm_add_epi16(q7, r1); 1311b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Interleave to do the multiply by constants which gets us 1312b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // into 32 bits. 
1313b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(x0, x3); 1314b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(x0, x3); 1315b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(x1, x2); 1316b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(x1, x2); 1317b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); 1318b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); 1319b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); 1320b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); 1321b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); 1322b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); 1323b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); 1324b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); 1325b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // dct_const_round_shift 1326b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1327b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1328b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 
1329b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1330b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 1331b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 1332b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 1333b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 1334b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1335b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1336b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1337b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1338b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 1339b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 1340b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 1341b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 1342b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine 1343b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res02 = _mm_packs_epi32(w0, w1); 1344b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res14 = _mm_packs_epi32(w2, w3); 
1345b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res10 = _mm_packs_epi32(w4, w5); 1346b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res06 = _mm_packs_epi32(w6, w7); 1347b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1348b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1349b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Work on the next eight values; step1 -> odd_results 1350b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1351b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // step 2 1352b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1353b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); 1354b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); 1355b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); 1356b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); 1357b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); 1358b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); 1359b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); 1360b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); 1361b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // dct_const_round_shift 1362b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 
1363b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1364b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1365b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1366b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1367b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1368b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1369b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1370b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine 1371b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step2_2 = _mm_packs_epi32(w0, w1); 1372b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step2_3 = _mm_packs_epi32(w2, w3); 1373b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1374b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1375b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); 1376b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); 1377b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); 1378b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); 1379b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 
1380b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 1381b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); 1382b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); 1383b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // dct_const_round_shift 1384b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1385b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1386b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1387b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1388b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1389b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1390b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1391b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1392b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine 1393b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step2_5 = _mm_packs_epi32(w0, w1); 1394b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step2_4 = _mm_packs_epi32(w2, w3); 1395b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1396b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // step 3 
1397b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1398b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step3_0 = _mm_add_epi16(step1_0, step2_3); 1399b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step3_1 = _mm_add_epi16(step1_1, step2_2); 1400b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step3_2 = _mm_sub_epi16(step1_1, step2_2); 1401b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step3_3 = _mm_sub_epi16(step1_0, step2_3); 1402b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step3_4 = _mm_sub_epi16(step1_7, step2_4); 1403b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step3_5 = _mm_sub_epi16(step1_6, step2_5); 1404b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step3_6 = _mm_add_epi16(step1_6, step2_5); 1405b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step3_7 = _mm_add_epi16(step1_7, step2_4); 1406b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1407b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // step 4 1408b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1409b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); 1410b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); 1411b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); 1412b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); 1413b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); 1414b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); 
1415b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); 1416b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); 1417b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // dct_const_round_shift 1418b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1419b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1420b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1421b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1422b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1423b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1424b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1425b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1426b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine 1427b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step2_1 = _mm_packs_epi32(w0, w1); 1428b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step2_2 = _mm_packs_epi32(w2, w3); 1429b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1430b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1431b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); 
1432b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); 1433b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); 1434b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); 1435b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); 1436b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); 1437b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); 1438b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); 1439b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // dct_const_round_shift 1440b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1441b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1442b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1443b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1444b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1445b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1446b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1447b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w3 = 
_mm_srai_epi32(v3, DCT_CONST_BITS); 1448b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine 1449b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step2_6 = _mm_packs_epi32(w0, w1); 1450b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step2_5 = _mm_packs_epi32(w2, w3); 1451b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1452b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // step 5 1453b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1454b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_0 = _mm_add_epi16(step3_0, step2_1); 1455b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_1 = _mm_sub_epi16(step3_0, step2_1); 1456b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_2 = _mm_sub_epi16(step3_3, step2_2); 1457b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_3 = _mm_add_epi16(step3_3, step2_2); 1458b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_4 = _mm_add_epi16(step3_4, step2_5); 1459b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_5 = _mm_sub_epi16(step3_4, step2_5); 1460b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_6 = _mm_sub_epi16(step3_7, step2_6); 1461b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian step1_7 = _mm_add_epi16(step3_7, step2_6); 1462b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1463b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // step 6 1464b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1465b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); 1466b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); 
1467b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); 1468b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); 1469b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); 1470b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); 1471b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); 1472b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); 1473b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // dct_const_round_shift 1474b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1475b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1476b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1477b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1478b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1479b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1480b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1481b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1482b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine 
1483b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res01 = _mm_packs_epi32(w0, w1); 1484b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res09 = _mm_packs_epi32(w2, w3); 1485b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1486b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1487b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); 1488b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); 1489b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); 1490b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); 1491b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); 1492b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); 1493b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); 1494b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); 1495b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // dct_const_round_shift 1496b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1497b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1498b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1499b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, 
k__DCT_CONST_ROUNDING); 1500b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1501b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1502b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1503b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1504b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine 1505b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res05 = _mm_packs_epi32(w0, w1); 1506b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res13 = _mm_packs_epi32(w2, w3); 1507b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1508b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1509b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); 1510b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); 1511b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); 1512b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); 1513b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); 1514b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); 1515b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); 1516b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); 
1517b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // dct_const_round_shift 1518b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1519b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1520b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1521b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1522b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1523b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1524b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1525b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1526b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine 1527b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res11 = _mm_packs_epi32(w0, w1); 1528b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res03 = _mm_packs_epi32(w2, w3); 1529b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1530b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1531b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); 1532b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); 1533b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); 1534b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); 1535b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); 1536b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); 1537b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); 1538b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); 1539b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // dct_const_round_shift 1540b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1541b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1542b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1543b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1544b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1545b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1546b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1547b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1548b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Combine 1549b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res15 = _mm_packs_epi32(w0, w1); 1550b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res07 = 
_mm_packs_epi32(w2, w3); 1551b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1552b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1553b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Transpose the results, do it as two 8x8 transposes. 1554b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1555b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 01 02 03 04 05 06 07 1556b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 10 11 12 13 14 15 16 17 1557b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 20 21 22 23 24 25 26 27 1558b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 30 31 32 33 34 35 36 37 1559b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 40 41 42 43 44 45 46 47 1560b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 50 51 52 53 54 55 56 57 1561b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 60 61 62 63 64 65 66 67 1562b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 70 71 72 73 74 75 76 77 1563b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); 1564b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); 1565b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); 1566b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); 1567b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); 1568b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); 1569b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const 
__m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); 1570b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); 1571b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 01 11 02 12 03 13 1572b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 20 30 21 31 22 32 23 33 1573b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 04 14 05 15 06 16 07 17 1574b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 24 34 25 35 26 36 27 37 1575b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 40 50 41 51 42 52 43 53 1576b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 60 70 61 71 62 72 63 73 1577b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 54 54 55 55 56 56 57 57 1578b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 64 74 65 75 66 76 67 77 1579b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 1580b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 1581b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 1582b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 1583b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 1584b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 1585b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 1586b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 
1587b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 20 30 01 11 21 31 1588b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 40 50 60 70 41 51 61 71 1589b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 02 12 22 32 03 13 23 33 1590b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 42 52 62 72 43 53 63 73 1591b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 04 14 24 34 05 15 21 36 1592b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 44 54 64 74 45 55 61 76 1593b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 06 16 26 36 07 17 27 37 1594b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 46 56 66 76 47 57 67 77 1595b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 1596b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 1597b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 1598b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 1599b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 1600b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 1601b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 1602b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 1603b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 20 30 40 50 60 70 1604b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 01 11 21 31 41 51 61 71 
1605b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 02 12 22 32 42 52 62 72 1606b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 03 13 23 33 43 53 63 73 1607b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 04 14 24 34 44 54 64 74 1608b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 05 15 25 35 45 55 65 75 1609b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 06 16 26 36 46 56 66 76 1610b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 07 17 27 37 47 57 67 77 1611b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0); 1612b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1); 1613b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2); 1614b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3); 1615b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4); 1616b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5); 1617b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6); 1618b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7); 1619b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1620b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian { 1621b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 01 02 03 04 05 06 07 1622b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 10 11 12 13 14 15 16 17 1623b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 20 21 22 23 
24 25 26 27 1624b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 30 31 32 33 34 35 36 37 1625b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 40 41 42 43 44 45 46 47 1626b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 50 51 52 53 54 55 56 57 1627b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 60 61 62 63 64 65 66 67 1628b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 70 71 72 73 74 75 76 77 1629b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); 1630b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); 1631b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); 1632b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); 1633b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13); 1634b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); 1635b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); 1636b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); 1637b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 01 11 02 12 03 13 1638b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 20 30 21 31 22 32 23 33 1639b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 04 14 05 15 06 16 07 17 1640b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 24 34 25 35 26 36 27 37 1641b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 40 50 41 51 42 
52 43 53 1642b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 60 70 61 71 62 72 63 73 1643b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 54 54 55 55 56 56 57 57 1644b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 64 74 65 75 66 76 67 77 1645b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 1646b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 1647b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 1648b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 1649b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 1650b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 1651b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 1652b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 1653b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 20 30 01 11 21 31 1654b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 40 50 60 70 41 51 61 71 1655b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 02 12 22 32 03 13 23 33 1656b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 42 52 62 72 43 53 63 73 1657b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 04 14 24 34 05 15 21 36 1658b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 44 54 64 74 45 55 61 76 1659b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 06 16 26 36 07 17 
27 37 1660b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 46 56 66 76 47 57 67 77 1661b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 1662b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 1663b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 1664b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 1665b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 1666b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 1667b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 1668b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 1669b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 00 10 20 30 40 50 60 70 1670b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 01 11 21 31 41 51 61 71 1671b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 02 12 22 32 42 52 62 72 1672b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 03 13 23 33 43 53 63 73 1673b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 04 14 24 34 44 54 64 74 1674b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 05 15 25 35 45 55 65 75 1675b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 06 16 26 36 46 56 66 76 1676b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // 07 17 27 37 47 57 67 77 1677b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Store results 
1678b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); 1679b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); 1680b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); 1681b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); 1682b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); 1683b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); 1684b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); 1685b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); 1686b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1687b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out += 8*16; 1688b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1689b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // Setup in/out for next pass. 
1690b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in = intermediate; 1691b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian out = output; 1692b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 1693b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 1694b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1695b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void load_buffer_16x16_avx2(const int16_t* input, __m128i *in0, 1696b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i *in1, int stride) { 1697b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // load first 8 columns 1698b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian load_buffer_8x8_avx2(input, in0, stride); 1699b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian load_buffer_8x8_avx2(input + 8 * stride, in0 + 8, stride); 1700b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1701b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian input += 8; 1702b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // load second 8 columns 1703b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian load_buffer_8x8_avx2(input, in1, stride); 1704b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian load_buffer_8x8_avx2(input + 8 * stride, in1 + 8, stride); 1705b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 1706b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1707b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0, 1708b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i *in1, int stride) { 1709b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // write first 8 columns 
1710b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian write_buffer_8x8_avx2(output, in0, stride); 1711b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian write_buffer_8x8_avx2(output + 8 * stride, in0 + 8, stride); 1712b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // write second 8 columns 1713b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian output += 8; 1714b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian write_buffer_8x8_avx2(output, in1, stride); 1715b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian write_buffer_8x8_avx2(output + 8 * stride, in1 + 8, stride); 1716b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 1717b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1718b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) { 1719b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i tbuf[8]; 1720b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8_avx2(res0, res0); 1721b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8_avx2(res1, tbuf); 1722b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8_avx2(res0 + 8, res1); 1723b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_8x8_avx2(res1 + 8, res1 + 8); 1724b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1725b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res0[8] = tbuf[0]; 1726b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res0[9] = tbuf[1]; 1727b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res0[10] = tbuf[2]; 1728b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res0[11] = tbuf[3]; 1729b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian res0[12] = tbuf[4]; 1730b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res0[13] = tbuf[5]; 1731b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res0[14] = tbuf[6]; 1732b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian res0[15] = tbuf[7]; 1733b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 1734b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1735b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) { 1736b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // perform rounding operations 1737b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian right_shift_8x8_avx2(res0, 2); 1738b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian right_shift_8x8_avx2(res0 + 8, 2); 1739b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian right_shift_8x8_avx2(res1, 2); 1740b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian right_shift_8x8_avx2(res1 + 8, 2); 1741b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 1742b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1743b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid fdct16_8col_avx2(__m128i *in) { 1744b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // perform 16x16 1-D DCT for 8 columns 1745b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i i[8], s[8], p[8], t[8], u[16], v[16]; 1746b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1747b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1748b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m16_p16 = 
pair_set_epi16(-cospi_16_64, cospi_16_64); 1749b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 1750b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1751b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1752b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 1753b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 1754b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 1755b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 1756b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); 1757b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); 1758b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 1759b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 1760b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); 1761b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); 1762b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 1763b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 1764b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1765b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1766b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 1 1767b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian i[0] = _mm_add_epi16(in[0], in[15]); 1768b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian i[1] = _mm_add_epi16(in[1], in[14]); 1769b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian i[2] = _mm_add_epi16(in[2], in[13]); 1770b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian i[3] = _mm_add_epi16(in[3], in[12]); 1771b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian i[4] = _mm_add_epi16(in[4], in[11]); 1772b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian i[5] = _mm_add_epi16(in[5], in[10]); 1773b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian i[6] = _mm_add_epi16(in[6], in[9]); 1774b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian i[7] = _mm_add_epi16(in[7], in[8]); 1775b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1776b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[0] = _mm_sub_epi16(in[7], in[8]); 1777b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[1] = _mm_sub_epi16(in[6], in[9]); 1778b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[2] = _mm_sub_epi16(in[5], in[10]); 1779b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[3] = _mm_sub_epi16(in[4], in[11]); 1780b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[4] = 
_mm_sub_epi16(in[3], in[12]); 1781b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[5] = _mm_sub_epi16(in[2], in[13]); 1782b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[6] = _mm_sub_epi16(in[1], in[14]); 1783b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[7] = _mm_sub_epi16(in[0], in[15]); 1784b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1785b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[0] = _mm_add_epi16(i[0], i[7]); 1786b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[1] = _mm_add_epi16(i[1], i[6]); 1787b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[2] = _mm_add_epi16(i[2], i[5]); 1788b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[3] = _mm_add_epi16(i[3], i[4]); 1789b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[4] = _mm_sub_epi16(i[3], i[4]); 1790b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[5] = _mm_sub_epi16(i[2], i[5]); 1791b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[6] = _mm_sub_epi16(i[1], i[6]); 1792b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[7] = _mm_sub_epi16(i[0], i[7]); 1793b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1794b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi16(p[0], p[3]); 1795b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_add_epi16(p[1], p[2]); 1796b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_sub_epi16(p[1], p[2]); 1797b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_sub_epi16(p[0], p[3]); 1798b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1799b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_unpacklo_epi16(u[0], u[1]); 1800b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian v[1] = _mm_unpackhi_epi16(u[0], u[1]); 1801b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_unpacklo_epi16(u[2], u[3]); 1802b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_unpackhi_epi16(u[2], u[3]); 1803b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1804b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); 1805b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); 1806b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); 1807b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); 1808b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); 1809b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); 1810b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); 1811b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); 1812b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1813b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1814b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1815b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1816b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1817b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 
1818b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1819b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1820b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1821b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1822b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1823b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1824b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1825b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1826b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1827b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1828b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1829b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1830b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1831b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = _mm_packs_epi32(u[0], u[1]); 1832b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_packs_epi32(u[4], u[5]); 1833b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_packs_epi32(u[2], u[3]); 1834b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = _mm_packs_epi32(u[6], u[7]); 1835b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1836b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian u[0] = _mm_unpacklo_epi16(p[5], p[6]); 1837b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(p[5], p[6]); 1838b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 1839b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 1840b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 1841b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 1842b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1843b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1844b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1845b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1846b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1847b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1848b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1849b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1850b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1851b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1852b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1853b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_packs_epi32(v[0], v[1]); 1854b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = 
_mm_packs_epi32(v[2], v[3]); 1855b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1856b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[0] = _mm_add_epi16(p[4], u[0]); 1857b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[1] = _mm_sub_epi16(p[4], u[0]); 1858b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[2] = _mm_sub_epi16(p[7], u[1]); 1859b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[3] = _mm_add_epi16(p[7], u[1]); 1860b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1861b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(t[0], t[3]); 1862b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(t[0], t[3]); 1863b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(t[1], t[2]); 1864b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(t[1], t[2]); 1865b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1866b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); 1867b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); 1868b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); 1869b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); 1870b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); 1871b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); 1872b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); 1873b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); 1874b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1875b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1876b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1877b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1878b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1879b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1880b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1881b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1882b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1883b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1884b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1885b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1886b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1887b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1888b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1889b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1890b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 
1891b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1892b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1893b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = _mm_packs_epi32(v[0], v[1]); 1894b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_packs_epi32(v[4], v[5]); 1895b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_packs_epi32(v[2], v[3]); 1896b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = _mm_packs_epi32(v[6], v[7]); 1897b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1898b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 2 1899b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(s[2], s[5]); 1900b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(s[2], s[5]); 1901b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(s[3], s[4]); 1902b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(s[3], s[4]); 1903b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1904b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 1905b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 1906b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 1907b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 1908b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 1909b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 
1910b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 1911b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 1912b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1913b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1914b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1915b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1916b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1917b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1918b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1919b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1920b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1921b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1922b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1923b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1924b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1925b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1926b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1927b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1928b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1929b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1930b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1931b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[2] = _mm_packs_epi32(v[0], v[1]); 1932b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[3] = _mm_packs_epi32(v[2], v[3]); 1933b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[4] = _mm_packs_epi32(v[4], v[5]); 1934b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[5] = _mm_packs_epi32(v[6], v[7]); 1935b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1936b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 3 1937b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[0] = _mm_add_epi16(s[0], t[3]); 1938b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[1] = _mm_add_epi16(s[1], t[2]); 1939b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[2] = _mm_sub_epi16(s[1], t[2]); 1940b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[3] = _mm_sub_epi16(s[0], t[3]); 1941b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[4] = _mm_sub_epi16(s[7], t[4]); 1942b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[5] = _mm_sub_epi16(s[6], t[5]); 1943b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[6] = _mm_add_epi16(s[6], t[5]); 1944b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian p[7] = _mm_add_epi16(s[7], t[4]); 1945b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1946b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 4 1947b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh 
Venkatasubramanian u[0] = _mm_unpacklo_epi16(p[1], p[6]); 1948b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(p[1], p[6]); 1949b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(p[2], p[5]); 1950b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(p[2], p[5]); 1951b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1952b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); 1953b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); 1954b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); 1955b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); 1956b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); 1957b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); 1958b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); 1959b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); 1960b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1961b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1962b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1963b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1964b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 
1965b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1966b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1967b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1968b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1969b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1970b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1971b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1972b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1973b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1974b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1975b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1976b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1977b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1978b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1979b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[1] = _mm_packs_epi32(v[0], v[1]); 1980b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[2] = _mm_packs_epi32(v[2], v[3]); 1981b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[5] = _mm_packs_epi32(v[4], v[5]); 1982b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian t[6] = _mm_packs_epi32(v[6], v[7]); 
1983b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1984b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 5 1985b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[0] = _mm_add_epi16(p[0], t[1]); 1986b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[1] = _mm_sub_epi16(p[0], t[1]); 1987b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[2] = _mm_sub_epi16(p[3], t[2]); 1988b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[3] = _mm_add_epi16(p[3], t[2]); 1989b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[4] = _mm_add_epi16(p[4], t[5]); 1990b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[5] = _mm_sub_epi16(p[4], t[5]); 1991b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[6] = _mm_sub_epi16(p[7], t[6]); 1992b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[7] = _mm_add_epi16(p[7], t[6]); 1993b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 1994b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 6 1995b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(s[0], s[7]); 1996b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(s[0], s[7]); 1997b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(s[1], s[6]); 1998b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(s[1], s[6]); 1999b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(s[2], s[5]); 2000b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_unpackhi_epi16(s[2], s[5]); 2001b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_unpacklo_epi16(s[3], s[4]); 2002b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = 
_mm_unpackhi_epi16(s[3], s[4]); 2003b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2004b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); 2005b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); 2006b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); 2007b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); 2008b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); 2009b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); 2010b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); 2011b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); 2012b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); 2013b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); 2014b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); 2015b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); 2016b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); 2017b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); 2018b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); 2019b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); 
2020b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2021b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2022b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2023b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2024b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2025b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2026b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2027b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2028b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2029b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 2030b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 2031b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 2032b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 2033b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 2034b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 2035b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 2036b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[15] = 
_mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 2037b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2038b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2039b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2040b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2041b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2042b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2043b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2044b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2045b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2046b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2047b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2048b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2049b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 2050b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2051b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2052b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 2053b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 
2054b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2055b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_packs_epi32(v[0], v[1]); 2056b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_packs_epi32(v[2], v[3]); 2057b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_packs_epi32(v[4], v[5]); 2058b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_packs_epi32(v[6], v[7]); 2059b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_packs_epi32(v[8], v[9]); 2060b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_packs_epi32(v[10], v[11]); 2061b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_packs_epi32(v[12], v[13]); 2062b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_packs_epi32(v[14], v[15]); 2063b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 2064b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2065b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid fadst16_8col_avx2(__m128i *in) { 2066b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // perform 16x16 1-D ADST for 8 columns 2067b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i s[16], x[16], u[32], v[32]; 2068b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 2069b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 2070b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 2071b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 
2072b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); 2073b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); 2074b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); 2075b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); 2076b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); 2077b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); 2078b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); 2079b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); 2080b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); 2081b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); 2082b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); 2083b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); 2084b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 2085b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p28_m04 
= pair_set_epi16(cospi_28_64, -cospi_4_64); 2086b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 2087b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 2088b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); 2089b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); 2090b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 2091b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 2092b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 2093b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); 2094b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 2095b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 2096b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 2097b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 2098b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian const __m128i kZero = _mm_set1_epi16(0); 2099b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
2100b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(in[15], in[0]); 2101b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(in[15], in[0]); 2102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(in[13], in[2]); 2103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(in[13], in[2]); 2104b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(in[11], in[4]); 2105b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_unpackhi_epi16(in[11], in[4]); 2106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_unpacklo_epi16(in[9], in[6]); 2107b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_unpackhi_epi16(in[9], in[6]); 2108b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[8] = _mm_unpacklo_epi16(in[7], in[8]); 2109b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[9] = _mm_unpackhi_epi16(in[7], in[8]); 2110b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[10] = _mm_unpacklo_epi16(in[5], in[10]); 2111b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[11] = _mm_unpackhi_epi16(in[5], in[10]); 2112b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[12] = _mm_unpacklo_epi16(in[3], in[12]); 2113b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[13] = _mm_unpackhi_epi16(in[3], in[12]); 2114b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[14] = _mm_unpacklo_epi16(in[1], in[14]); 2115b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[15] = _mm_unpackhi_epi16(in[1], in[14]); 2116b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2117b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); 
2118b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); 2119b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); 2120b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); 2121b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); 2122b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); 2123b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); 2124b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); 2125b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); 2126b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); 2127b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); 2128b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); 2129b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); 2130b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); 2131b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); 2132b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); 2133b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); 2134b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[17] = _mm_madd_epi16(u[9], 
k__cospi_p17_p15); 2135b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); 2136b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); 2137b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); 2138b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); 2139b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); 2140b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); 2141b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); 2142b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); 2143b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); 2144b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); 2145b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); 2146b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); 2147b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); 2148b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); 2149b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2150b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], v[16]); 2151b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], v[17]); 
2152b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], v[18]); 2153b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], v[19]); 2154b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], v[20]); 2155b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], v[21]); 2156b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], v[22]); 2157b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], v[23]); 2158b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[8] = _mm_add_epi32(v[8], v[24]); 2159b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[9] = _mm_add_epi32(v[9], v[25]); 2160b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[10] = _mm_add_epi32(v[10], v[26]); 2161b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[11] = _mm_add_epi32(v[11], v[27]); 2162b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[12] = _mm_add_epi32(v[12], v[28]); 2163b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[13] = _mm_add_epi32(v[13], v[29]); 2164b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[14] = _mm_add_epi32(v[14], v[30]); 2165b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[15] = _mm_add_epi32(v[15], v[31]); 2166b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[16] = _mm_sub_epi32(v[0], v[16]); 2167b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[17] = _mm_sub_epi32(v[1], v[17]); 2168b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[18] = _mm_sub_epi32(v[2], v[18]); 2169b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[19] = _mm_sub_epi32(v[3], v[19]); 2170b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
u[20] = _mm_sub_epi32(v[4], v[20]); 2171b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[21] = _mm_sub_epi32(v[5], v[21]); 2172b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[22] = _mm_sub_epi32(v[6], v[22]); 2173b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[23] = _mm_sub_epi32(v[7], v[23]); 2174b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[24] = _mm_sub_epi32(v[8], v[24]); 2175b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[25] = _mm_sub_epi32(v[9], v[25]); 2176b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[26] = _mm_sub_epi32(v[10], v[26]); 2177b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[27] = _mm_sub_epi32(v[11], v[27]); 2178b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[28] = _mm_sub_epi32(v[12], v[28]); 2179b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[29] = _mm_sub_epi32(v[13], v[29]); 2180b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[30] = _mm_sub_epi32(v[14], v[30]); 2181b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[31] = _mm_sub_epi32(v[15], v[31]); 2182b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2183b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 2184b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 2185b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 2186b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 2187b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 2188b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_add_epi32(u[5], 
k__DCT_CONST_ROUNDING); 2189b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 2190b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 2191b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 2192b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 2193b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 2194b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 2195b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 2196b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 2197b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 2198b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 2199b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 2200b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 2201b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); 2202b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 2203b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 2204b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 
2205b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 2206b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 2207b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 2208b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 2209b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 2210b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 2211b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 2212b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 2213b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 2214b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); 2215b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2216b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 2217b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 2218b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 2219b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 2220b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 2221b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_srai_epi32(v[5], 
DCT_CONST_BITS); 2222b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 2223b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 2224b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 2225b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 2226b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 2227b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 2228b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 2229b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 2230b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 2231b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 2232b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); 2233b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); 2234b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); 2235b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); 2236b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); 2237b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); 2238b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[22] = _mm_srai_epi32(v[22], 
DCT_CONST_BITS); 2239b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); 2240b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); 2241b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); 2242b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); 2243b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); 2244b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); 2245b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); 2246b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); 2247b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); 2248b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2249b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[0] = _mm_packs_epi32(u[0], u[1]); 2250b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[1] = _mm_packs_epi32(u[2], u[3]); 2251b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[2] = _mm_packs_epi32(u[4], u[5]); 2252b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[3] = _mm_packs_epi32(u[6], u[7]); 2253b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[4] = _mm_packs_epi32(u[8], u[9]); 2254b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[5] = _mm_packs_epi32(u[10], u[11]); 2255b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[6] = _mm_packs_epi32(u[12], u[13]); 2256b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[7] = 
_mm_packs_epi32(u[14], u[15]); 2257b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[8] = _mm_packs_epi32(u[16], u[17]); 2258b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[9] = _mm_packs_epi32(u[18], u[19]); 2259b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[10] = _mm_packs_epi32(u[20], u[21]); 2260b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[11] = _mm_packs_epi32(u[22], u[23]); 2261b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[12] = _mm_packs_epi32(u[24], u[25]); 2262b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[13] = _mm_packs_epi32(u[26], u[27]); 2263b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[14] = _mm_packs_epi32(u[28], u[29]); 2264b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[15] = _mm_packs_epi32(u[30], u[31]); 2265b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2266b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 2 2267b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(s[8], s[9]); 2268b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(s[8], s[9]); 2269b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(s[10], s[11]); 2270b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(s[10], s[11]); 2271b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(s[12], s[13]); 2272b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_unpackhi_epi16(s[12], s[13]); 2273b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_unpacklo_epi16(s[14], s[15]); 2274b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_unpackhi_epi16(s[14], s[15]); 
2275b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2276b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 2277b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 2278b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 2279b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 2280b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 2281b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 2282b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 2283b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 2284b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); 2285b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); 2286b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); 2287b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); 2288b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); 2289b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); 2290b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); 2291b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); 
2292b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2293b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], v[8]); 2294b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], v[9]); 2295b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], v[10]); 2296b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], v[11]); 2297b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], v[12]); 2298b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], v[13]); 2299b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], v[14]); 2300b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], v[15]); 2301b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[8] = _mm_sub_epi32(v[0], v[8]); 2302b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[9] = _mm_sub_epi32(v[1], v[9]); 2303b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[10] = _mm_sub_epi32(v[2], v[10]); 2304b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[11] = _mm_sub_epi32(v[3], v[11]); 2305b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[12] = _mm_sub_epi32(v[4], v[12]); 2306b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[13] = _mm_sub_epi32(v[5], v[13]); 2307b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[14] = _mm_sub_epi32(v[6], v[14]); 2308b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[15] = _mm_sub_epi32(v[7], v[15]); 2309b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2310b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 
2311b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 2312b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 2313b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 2314b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 2315b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 2316b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 2317b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 2318b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 2319b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 2320b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 2321b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 2322b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 2323b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 2324b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 2325b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 2326b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2327b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = 
_mm_srai_epi32(v[0], DCT_CONST_BITS); 2328b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 2329b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 2330b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 2331b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 2332b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 2333b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 2334b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 2335b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 2336b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 2337b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 2338b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 2339b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 2340b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 2341b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 2342b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 2343b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2344b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[0] = _mm_add_epi16(s[0], s[4]); 
2345b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[1] = _mm_add_epi16(s[1], s[5]); 2346b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[2] = _mm_add_epi16(s[2], s[6]); 2347b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[3] = _mm_add_epi16(s[3], s[7]); 2348b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[4] = _mm_sub_epi16(s[0], s[4]); 2349b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[5] = _mm_sub_epi16(s[1], s[5]); 2350b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[6] = _mm_sub_epi16(s[2], s[6]); 2351b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[7] = _mm_sub_epi16(s[3], s[7]); 2352b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[8] = _mm_packs_epi32(u[0], u[1]); 2353b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[9] = _mm_packs_epi32(u[2], u[3]); 2354b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[10] = _mm_packs_epi32(u[4], u[5]); 2355b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[11] = _mm_packs_epi32(u[6], u[7]); 2356b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[12] = _mm_packs_epi32(u[8], u[9]); 2357b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[13] = _mm_packs_epi32(u[10], u[11]); 2358b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[14] = _mm_packs_epi32(u[12], u[13]); 2359b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian x[15] = _mm_packs_epi32(u[14], u[15]); 2360b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2361b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 3 2362b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(x[4], x[5]); 2363b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(x[4], x[5]); 
2364b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(x[6], x[7]); 2365b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(x[6], x[7]); 2366b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(x[12], x[13]); 2367b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_unpackhi_epi16(x[12], x[13]); 2368b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_unpacklo_epi16(x[14], x[15]); 2369b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_unpackhi_epi16(x[14], x[15]); 2370b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2371b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); 2372b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 2373b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); 2374b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 2375b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); 2376b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); 2377b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 2378b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 2379b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); 2380b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); 2381b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[10] = 
_mm_madd_epi16(u[4], k__cospi_p24_m08); 2382b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); 2383b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); 2384b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); 2385b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); 2386b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); 2387b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2388b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], v[4]); 2389b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], v[5]); 2390b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], v[6]); 2391b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], v[7]); 2392b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_sub_epi32(v[0], v[4]); 2393b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_sub_epi32(v[1], v[5]); 2394b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_sub_epi32(v[2], v[6]); 2395b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_sub_epi32(v[3], v[7]); 2396b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[8] = _mm_add_epi32(v[8], v[12]); 2397b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[9] = _mm_add_epi32(v[9], v[13]); 2398b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[10] = _mm_add_epi32(v[10], v[14]); 2399b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[11] = _mm_add_epi32(v[11], v[15]); 
2400b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[12] = _mm_sub_epi32(v[8], v[12]); 2401b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[13] = _mm_sub_epi32(v[9], v[13]); 2402b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[14] = _mm_sub_epi32(v[10], v[14]); 2403b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[15] = _mm_sub_epi32(v[11], v[15]); 2404b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2405b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 2406b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 2407b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 2408b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 2409b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 2410b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 2411b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 2412b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 2413b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 2414b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 2415b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 2416b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 
2417b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 2418b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 2419b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 2420b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 2421b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2422b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2423b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2424b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2425b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2426b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2427b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2428b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2429b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2430b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2431b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2432b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2433b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 
2434b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2435b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2436b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 2437b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 2438b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2439b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[0] = _mm_add_epi16(x[0], x[2]); 2440b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[1] = _mm_add_epi16(x[1], x[3]); 2441b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[2] = _mm_sub_epi16(x[0], x[2]); 2442b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[3] = _mm_sub_epi16(x[1], x[3]); 2443b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[4] = _mm_packs_epi32(v[0], v[1]); 2444b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[5] = _mm_packs_epi32(v[2], v[3]); 2445b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[6] = _mm_packs_epi32(v[4], v[5]); 2446b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[7] = _mm_packs_epi32(v[6], v[7]); 2447b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[8] = _mm_add_epi16(x[8], x[10]); 2448b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[9] = _mm_add_epi16(x[9], x[11]); 2449b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[10] = _mm_sub_epi16(x[8], x[10]); 2450b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[11] = _mm_sub_epi16(x[9], x[11]); 2451b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[12] = _mm_packs_epi32(v[8], v[9]); 2452b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 
s[13] = _mm_packs_epi32(v[10], v[11]); 2453b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[14] = _mm_packs_epi32(v[12], v[13]); 2454b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian s[15] = _mm_packs_epi32(v[14], v[15]); 2455b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2456b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // stage 4 2457b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_unpacklo_epi16(s[2], s[3]); 2458b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_unpackhi_epi16(s[2], s[3]); 2459b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_unpacklo_epi16(s[6], s[7]); 2460b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_unpackhi_epi16(s[6], s[7]); 2461b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_unpacklo_epi16(s[10], s[11]); 2462b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_unpackhi_epi16(s[10], s[11]); 2463b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_unpacklo_epi16(s[14], s[15]); 2464b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_unpackhi_epi16(s[14], s[15]); 2465b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2466b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); 2467b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); 2468b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 2469b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 2470b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 
2471b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 2472b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 2473b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 2474b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); 2475b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); 2476b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); 2477b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); 2478b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); 2479b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); 2480b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); 2481b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); 2482b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2483b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2484b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2485b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2486b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2487b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 
2488b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2489b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2490b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2491b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 2492b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 2493b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 2494b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 2495b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 2496b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 2497b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 2498b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 2499b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2500b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2501b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2502b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2503b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2504b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[4] = _mm_srai_epi32(u[4], 
DCT_CONST_BITS); 2505b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2506b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2507b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2508b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2509b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2510b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2511b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 2512b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2513b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2514b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 2515b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 2516b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2517b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[0] = s[0]; 2518b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[1] = _mm_sub_epi16(kZero, s[8]); 2519b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[2] = s[12]; 2520b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[3] = _mm_sub_epi16(kZero, s[4]); 2521b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[4] = _mm_packs_epi32(v[4], v[5]); 2522b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[5] = _mm_packs_epi32(v[12], v[13]); 
2523b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[6] = _mm_packs_epi32(v[8], v[9]); 2524b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[7] = _mm_packs_epi32(v[0], v[1]); 2525b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[8] = _mm_packs_epi32(v[2], v[3]); 2526b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[9] = _mm_packs_epi32(v[10], v[11]); 2527b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[10] = _mm_packs_epi32(v[14], v[15]); 2528b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[11] = _mm_packs_epi32(v[6], v[7]); 2529b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[12] = s[5]; 2530b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[13] = _mm_sub_epi16(kZero, s[13]); 2531b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[14] = s[9]; 2532b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian in[15] = _mm_sub_epi16(kZero, s[1]); 2533b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 2534b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2535b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid fdct16_avx2(__m128i *in0, __m128i *in1) { 2536b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fdct16_8col_avx2(in0); 2537b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fdct16_8col_avx2(in1); 2538b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_16x16_avx2(in0, in1); 2539b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 2540b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2541b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid fadst16_avx2(__m128i *in0, __m128i *in1) { 2542b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fadst16_8col_avx2(in0); 
2543b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fadst16_8col_avx2(in1); 2544b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian array_transpose_16x16_avx2(in0, in1); 2545b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 2546b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2547b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid vp9_fht16x16_avx2(const int16_t *input, int16_t *output, 2548b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int stride, int tx_type) { 2549b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i in0[16], in1[16]; 2550b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2551b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian switch (tx_type) { 2552b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian case DCT_DCT: 2553b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian vp9_fdct16x16_avx2(input, output, stride); 2554b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian break; 2555b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian case ADST_DCT: 2556b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian load_buffer_16x16_avx2(input, in0, in1, stride); 2557b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fadst16_avx2(in0, in1); 2558b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian right_shift_16x16_avx2(in0, in1); 2559b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fdct16_avx2(in0, in1); 2560b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian write_buffer_16x16_avx2(output, in0, in1, 16); 2561b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian break; 2562b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian case DCT_ADST: 2563b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian load_buffer_16x16_avx2(input, in0, 
in1, stride); 2564b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fdct16_avx2(in0, in1); 2565b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian right_shift_16x16_avx2(in0, in1); 2566b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fadst16_avx2(in0, in1); 2567b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian write_buffer_16x16_avx2(output, in0, in1, 16); 2568b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian break; 2569b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian case ADST_ADST: 2570b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian load_buffer_16x16_avx2(input, in0, in1, stride); 2571b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fadst16_avx2(in0, in1); 2572b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian right_shift_16x16_avx2(in0, in1); 2573b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian fadst16_avx2(in0, in1); 2574b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian write_buffer_16x16_avx2(output, in0, in1, 16); 2575b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian break; 2576b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian default: 2577b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian assert(0); 2578b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian break; 2579b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 2580b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 2581b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2582b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 2583b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define FDCT32x32_HIGH_PRECISION 0 2584b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#include "vp9/encoder/x86/vp9_dct32x32_avx2.c" 
2585b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#undef FDCT32x32_2D_AVX2 2586b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#undef FDCT32x32_HIGH_PRECISION 2587b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2588b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2 2589b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#define FDCT32x32_HIGH_PRECISION 1 2590b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT 2591b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#undef FDCT32x32_2D_AVX2 2592b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#undef FDCT32x32_HIGH_PRECISION 2593