/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <immintrin.h>  // AVX2

#include "vp9/common/vp9_idct.h"  // for cospi constants
#include "vpx_ports/mem.h"

15b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
16b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // The 2D transform is done with two passes which are actually pretty
17b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // similar. In the first one, we transform the columns and transpose
18b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // the results. In the second one, we transform the rows. To achieve that,
19b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // as the first pass results are transposed, we transpose the columns (that
20b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // is the transposed rows) and transpose the results (so that it goes back
21b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // in normal/row positions).
22b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  int pass;
23b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Constants
24b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  //    When we use them, in one case, they are all the same. In all others
25b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  //    it's a pair of them that we need to repeat four times. This is done
26b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  //    by constructing the 32 bit constant corresponding to that pair.
27b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
28b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
29b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
30b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
31b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
32b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
33b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
34b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i kOne = _mm_set1_epi16(1);
35b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in0, in1, in2, in3;
36b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Load inputs.
37b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  {
38b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
39b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
40b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
41b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
42b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // x = x << 4
43b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in0 = _mm_slli_epi16(in0, 4);
44b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in1 = _mm_slli_epi16(in1, 4);
45b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in2 = _mm_slli_epi16(in2, 4);
46b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in3 = _mm_slli_epi16(in3, 4);
47b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // if (i == 0 && input[0]) input[0] += 1;
48b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    {
49b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // The mask will only contain whether the first value is zero, all
50b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // other comparison will fail as something shifted by 4 (above << 4)
51b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // can never be equal to one. To increment in the non-zero case, we
52b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // add the mask and one for the first element:
53b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      //   - if zero, mask = -1, v = v - 1 + 1 = v
54b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
55b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
56b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in0 = _mm_add_epi16(in0, mask);
57b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
58b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    }
59b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  }
60b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Do the two transform/transpose passes
61b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  for (pass = 0; pass < 2; ++pass) {
62b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // Transform 1/2: Add/subtract
63b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i r0 = _mm_add_epi16(in0, in3);
64b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i r1 = _mm_add_epi16(in1, in2);
65b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i r2 = _mm_sub_epi16(in1, in2);
66b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i r3 = _mm_sub_epi16(in0, in3);
67b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // Transform 1/2: Interleave to do the multiply by constants which gets us
68b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    //                into 32 bits.
69b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
70b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
71b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
72b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
73b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
74b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
75b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
76b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
77b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
78b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
79b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
80b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
81b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
82b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
83b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // Combine and transpose
84b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i res0 = _mm_packs_epi32(w0, w2);
85b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i res1 = _mm_packs_epi32(w4, w6);
86b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // 00 01 02 03 20 21 22 23
87b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // 10 11 12 13 30 31 32 33
88b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
89b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
90b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // 00 10 01 11 02 12 03 13
91b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // 20 30 21 31 22 32 23 33
92b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
93b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
94b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
95b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
96b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    if (0 == pass) {
97b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Extract values in the high part for second pass as transform code
98b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // only uses the first four values.
99b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in1 = _mm_unpackhi_epi64(in0, in0);
100b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in3 = _mm_unpackhi_epi64(in2, in2);
101b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    } else {
102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Post-condition output and store it (v + 1) >> 2, taking advantage
103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // of the fact 1/3 are stored just after 0/2.
104b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      __m128i out01 = _mm_add_epi16(in0, kOne);
105b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      __m128i out23 = _mm_add_epi16(in2, kOne);
106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      out01 = _mm_srai_epi16(out01, 2);
107b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      out23 = _mm_srai_epi16(out23, 2);
108b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
109b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
110b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    }
111b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  }
112b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}
113b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
// Loads a 4x4 block of 16-bit values and pre-conditions it for the hybrid
// transforms.
//
//   input  - top-left sample; rows are `stride` int16_t elements apart.
//   in     - receives four registers, one row each in the low four 16-bit
//            lanes (the upper four lanes are zero).
//   stride - distance, in int16_t elements, between input rows.
//
// Every sample is scaled by 16 (<< 4); afterwards the very first sample is
// incremented by one if, and only if, it is non-zero.
static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in,
                                        int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;

  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);

  // Lane 0 is compared with 0 and the remaining lanes with 1; as the low four
  // bits of every lane are clear after the shift, only lane 0 can match.
  //   - lane 0 zero:     mask = -1, v - 1 + 1 = v
  //   - lane 0 non-zero: mask =  0, v + 0 + 1 = v + 1
  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}

// Rounds and stores a transformed 4x4 block.
//
//   output - receives 16 coefficients as four contiguous rows of four.
//   res    - four registers; only the low four 16-bit lanes of each are used,
//            res[k] being row k.
//
// Each value v is written as (v + 1) >> 2 (arithmetic shift). Unaligned
// stores are used so that `output` carries no 16-byte alignment requirement,
// matching what vp9_fdct4x4_avx2 does for the DCT_DCT case.
static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  // Pack two rows per register: rows 0|1 and rows 2|3.
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  _mm_storeu_si128((__m128i *)(output + 0 * 8), out01);
  _mm_storeu_si128((__m128i *)(output + 1 * 8), out23);
}

// Transposes a 4x4 block of 16-bit values held in "two rows per register"
// form.
//
// On entry (element "rc" = row r, column c):
//   res[0]: 00 01 02 03 20 21 22 23
//   res[1]: 10 11 12 13 30 31 32 33
// res[2] and res[3] are not read.
//
// On exit the low four lanes of res[k] hold row k of the transposed block.
static INLINE void transpose_4x4_avx2(__m128i *res) {
  // Interleave 16-bit lanes of the two inputs.
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
  // tr0_0: 00 10 01 11 02 12 03 13
  // tr0_1: 20 30 21 31 22 32 23 33

  // Interleave 32-bit pairs to finish the transpose.
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // res[0]: 00 10 20 30 01 11 21 31
  // res[2]: 02 12 22 32 03 13 23 33

  // Move the upper halves down, since consumers only use the first four
  // 16-bit integers of each register.
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

166b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid fdct4_avx2(__m128i *in) {
167b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
168b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
169b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
170b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
171b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
172b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
173b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i u[4], v[4];
174b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0]=_mm_unpacklo_epi16(in[0], in[1]);
175b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1]=_mm_unpacklo_epi16(in[3], in[2]);
176b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
177b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[0] = _mm_add_epi16(u[0], u[1]);
178b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[1] = _mm_sub_epi16(u[0], u[1]);
179b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
180b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
181b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
182b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
183b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
184b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
185b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
186b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
187b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
188b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
189b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
190b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
191b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
192b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
193b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
194b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[0] = _mm_packs_epi32(u[0], u[1]);
195b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[1] = _mm_packs_epi32(u[2], u[3]);
196b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  transpose_4x4_avx2(in);
197b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}
198b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
199b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid fadst4_avx2(__m128i *in) {
200b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
201b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
202b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
203b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
204b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
205b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i kZero = _mm_set1_epi16(0);
206b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
207b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i u[8], v[8];
208b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in7 = _mm_add_epi16(in[0], in[1]);
209b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
210b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
211b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
212b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[2] = _mm_unpacklo_epi16(in7, kZero);
213b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[3] = _mm_unpacklo_epi16(in[2], kZero);
214b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[4] = _mm_unpacklo_epi16(in[3], kZero);
215b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
216b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
217b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
218b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
219b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
220b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
221b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
222b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
223b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
224b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0] = _mm_add_epi32(v[0], v[1]);
225b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1] = _mm_sub_epi32(v[2], v[6]);
226b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[2] = _mm_add_epi32(v[3], v[4]);
227b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[3] = _mm_sub_epi32(u[2], u[0]);
228b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[4] = _mm_slli_epi32(v[5], 2);
229b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[5] = _mm_sub_epi32(u[4], v[5]);
230b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[6] = _mm_add_epi32(u[3], u[5]);
231b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
232b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
233b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
234b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
235b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
236b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
237b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
238b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
239b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
240b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
241b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
242b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[0] = _mm_packs_epi32(u[0], u[2]);
243b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in[1] = _mm_packs_epi32(u[1], u[3]);
244b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  transpose_4x4_avx2(in);
245b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}
246b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
247b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid vp9_fht4x4_avx2(const int16_t *input, int16_t *output,
248b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                     int stride, int tx_type) {
249b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in[4];
250b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
251b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  switch (tx_type) {
252b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    case DCT_DCT:
253b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      vp9_fdct4x4_avx2(input, output, stride);
254b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      break;
255b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    case ADST_DCT:
256b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      load_buffer_4x4_avx2(input, in, stride);
257b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      fadst4_avx2(in);
258b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      fdct4_avx2(in);
259b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      write_buffer_4x4_avx2(output, in);
260b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      break;
261b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    case DCT_ADST:
262b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      load_buffer_4x4_avx2(input, in, stride);
263b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      fdct4_avx2(in);
264b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      fadst4_avx2(in);
265b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      write_buffer_4x4_avx2(output, in);
266b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      break;
267b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    case ADST_ADST:
268b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      load_buffer_4x4_avx2(input, in, stride);
269b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      fadst4_avx2(in);
270b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      fadst4_avx2(in);
271b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      write_buffer_4x4_avx2(output, in);
272b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      break;
273b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    default:
274b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      assert(0);
275b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      break;
276b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  }
277b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}
278b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
279b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
280b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  int pass;
281b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Constants
282b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  //    When we use them, in one case, they are all the same. In all others
283b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  //    it's a pair of them that we need to repeat four times. This is done
284b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  //    by constructing the 32 bit constant corresponding to that pair.
285b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
286b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
287b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
288b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
289b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
290b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
291b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
292b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
293b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
294b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Load input
295b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
296b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
297b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
298b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
299b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
300b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
301b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
302b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
303b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Pre-condition input (shift by two)
304b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in0 = _mm_slli_epi16(in0, 2);
305b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in1 = _mm_slli_epi16(in1, 2);
306b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in2 = _mm_slli_epi16(in2, 2);
307b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in3 = _mm_slli_epi16(in3, 2);
308b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in4 = _mm_slli_epi16(in4, 2);
309b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in5 = _mm_slli_epi16(in5, 2);
310b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in6 = _mm_slli_epi16(in6, 2);
311b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  in7 = _mm_slli_epi16(in7, 2);
312b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
313b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // We do two passes, first the columns, then the rows. The results of the
314b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // first pass are transposed so that the same column code can be reused. The
315b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // results of the second pass are also transposed so that the rows (processed
316b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // as columns) are put back in row positions.
317b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  for (pass = 0; pass < 2; pass++) {
318b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // To store results of each pass before the transpose.
319b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
320b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // Add/subtract
321b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i q0 = _mm_add_epi16(in0, in7);
322b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i q1 = _mm_add_epi16(in1, in6);
323b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i q2 = _mm_add_epi16(in2, in5);
324b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i q3 = _mm_add_epi16(in3, in4);
325b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i q4 = _mm_sub_epi16(in3, in4);
326b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i q5 = _mm_sub_epi16(in2, in5);
327b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i q6 = _mm_sub_epi16(in1, in6);
328b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i q7 = _mm_sub_epi16(in0, in7);
329b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // Work on first four results
330b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    {
331b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Add/subtract
332b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i r0 = _mm_add_epi16(q0, q3);
333b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i r1 = _mm_add_epi16(q1, q2);
334b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i r2 = _mm_sub_epi16(q1, q2);
335b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i r3 = _mm_sub_epi16(q0, q3);
336b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Interleave to do the multiply by constants which gets us into 32bits
337b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
338b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
339b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
340b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
341b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
342b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
343b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
344b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
345b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
346b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
347b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
348b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
349b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // dct_const_round_shift
350b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
351b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
352b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
353b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
354b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
355b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
356b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
357b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
358b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
359b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
360b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
361b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
362b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
363b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
364b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
365b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
366b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Combine
367b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      res0 = _mm_packs_epi32(w0, w1);
368b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      res4 = _mm_packs_epi32(w2, w3);
369b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      res2 = _mm_packs_epi32(w4, w5);
370b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      res6 = _mm_packs_epi32(w6, w7);
371b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    }
372b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // Work on next four results
373b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    {
374b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Interleave to do the multiply by constants which gets us into 32bits
375b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
376b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
377b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
378b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
379b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
380b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
381b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // dct_const_round_shift
382b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
383b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
384b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
385b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
386b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
387b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
388b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
389b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
390b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Combine
391b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i r0 = _mm_packs_epi32(s0, s1);
392b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i r1 = _mm_packs_epi32(s2, s3);
393b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Add/subtract
394b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i x0 = _mm_add_epi16(q4, r0);
395b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i x1 = _mm_sub_epi16(q4, r0);
396b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i x2 = _mm_sub_epi16(q7, r1);
397b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i x3 = _mm_add_epi16(q7, r1);
398b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Interleave to do the multiply by constants which gets us into 32bits
399b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
400b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
401b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
402b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
403b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
404b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
405b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
406b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
407b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
408b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
409b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
410b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
411b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // dct_const_round_shift
412b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
413b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
414b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
415b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
416b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
417b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
418b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
419b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
420b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
421b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
422b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
423b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
424b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
425b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
426b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
427b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
428b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // Combine
429b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      res1 = _mm_packs_epi32(w0, w1);
430b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      res7 = _mm_packs_epi32(w2, w3);
431b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      res5 = _mm_packs_epi32(w4, w5);
432b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      res3 = _mm_packs_epi32(w6, w7);
433b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    }
434b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // Transpose the 8x8.
435b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    {
436b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 00 01 02 03 04 05 06 07
437b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 10 11 12 13 14 15 16 17
438b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 20 21 22 23 24 25 26 27
439b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 30 31 32 33 34 35 36 37
440b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 40 41 42 43 44 45 46 47
441b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 50 51 52 53 54 55 56 57
442b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 60 61 62 63 64 65 66 67
443b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 70 71 72 73 74 75 76 77
444b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
445b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
446b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
447b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
448b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
449b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
450b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
451b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
452b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 00 10 01 11 02 12 03 13
453b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 20 30 21 31 22 32 23 33
454b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 04 14 05 15 06 16 07 17
455b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 24 34 25 35 26 36 27 37
456b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 40 50 41 51 42 52 43 53
457b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 60 70 61 71 62 72 63 73
458b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 54 54 55 55 56 56 57 57
459b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 64 74 65 75 66 76 67 77
460b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
461b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
462b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
463b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
464b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
465b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
466b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
467b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
468b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 00 10 20 30 01 11 21 31
469b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 40 50 60 70 41 51 61 71
470b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 02 12 22 32 03 13 23 33
471b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 42 52 62 72 43 53 63 73
472b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 04 14 24 34 05 15 21 36
473b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 44 54 64 74 45 55 61 76
474b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 06 16 26 36 07 17 27 37
475b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 46 56 66 76 47 57 67 77
476b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
477b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
478b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
479b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
480b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
481b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
482b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
483b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
484b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 00 10 20 30 40 50 60 70
485b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 01 11 21 31 41 51 61 71
486b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 02 12 22 32 42 52 62 72
487b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 03 13 23 33 43 53 63 73
488b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 04 14 24 34 44 54 64 74
489b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 05 15 25 35 45 55 65 75
490b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 06 16 26 36 46 56 66 76
491b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      // 07 17 27 37 47 57 67 77
492b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    }
493b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  }
494b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // Post-condition output and store it
495b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  {
496b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // Post-condition (division by two)
497b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    //    division of two 16 bits signed numbers using shifts
498b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    //    n / 2 = (n - (n >> 15)) >> 1
499b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
500b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
501b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
502b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
503b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
504b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
505b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
506b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
507b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in0 = _mm_sub_epi16(in0, sign_in0);
508b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in1 = _mm_sub_epi16(in1, sign_in1);
509b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in2 = _mm_sub_epi16(in2, sign_in2);
510b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in3 = _mm_sub_epi16(in3, sign_in3);
511b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in4 = _mm_sub_epi16(in4, sign_in4);
512b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in5 = _mm_sub_epi16(in5, sign_in5);
513b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in6 = _mm_sub_epi16(in6, sign_in6);
514b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in7 = _mm_sub_epi16(in7, sign_in7);
515b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in0 = _mm_srai_epi16(in0, 1);
516b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in1 = _mm_srai_epi16(in1, 1);
517b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in2 = _mm_srai_epi16(in2, 1);
518b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in3 = _mm_srai_epi16(in3, 1);
519b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in4 = _mm_srai_epi16(in4, 1);
520b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in5 = _mm_srai_epi16(in5, 1);
521b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in6 = _mm_srai_epi16(in6, 1);
522b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    in7 = _mm_srai_epi16(in7, 1);
523b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    // store results
524b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
525b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
526b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
527b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
528b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
529b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
530b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
531b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
532b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  }
533b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}
534b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
// Loads an 8x8 block of 16-bit samples into in[0..7], one 128-bit register
// per row, and pre-scales every sample by 4 (left shift by two).
//   input:  first sample of the block; rows are `stride` int16_t elements
//           apart and are read with aligned loads, so each row address must
//           be 16-byte aligned.
//   in:     receives the eight scaled rows.
//   stride: distance between consecutive input rows, in elements.
static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in,
                                        int stride) {
  int r;

  // Fetch all eight rows first ...
  for (r = 0; r < 8; ++r) {
    in[r] = _mm_load_si128((const __m128i *)(input + r * stride));
  }

  // ... then apply the pre-scaling in place.
  for (r = 0; r < 8; ++r) {
    in[r] = _mm_slli_epi16(in[r], 2);
  }
}
556b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
557b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian// right shift and rounding
558b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) {
559b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i kOne = _mm_set1_epi16(1);
560b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const int bit_m02 = bit - 2;
561b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i sign0 = _mm_srai_epi16(res[0], 15);
562b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i sign1 = _mm_srai_epi16(res[1], 15);
563b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i sign2 = _mm_srai_epi16(res[2], 15);
564b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i sign3 = _mm_srai_epi16(res[3], 15);
565b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i sign4 = _mm_srai_epi16(res[4], 15);
566b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i sign5 = _mm_srai_epi16(res[5], 15);
567b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i sign6 = _mm_srai_epi16(res[6], 15);
568b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i sign7 = _mm_srai_epi16(res[7], 15);
569b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
570b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  if (bit_m02 >= 0) {
571b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
572b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    res[0] = _mm_add_epi16(res[0], k_const_rounding);
573b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    res[1] = _mm_add_epi16(res[1], k_const_rounding);
574b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    res[2] = _mm_add_epi16(res[2], k_const_rounding);
575b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    res[3] = _mm_add_epi16(res[3], k_const_rounding);
576b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    res[4] = _mm_add_epi16(res[4], k_const_rounding);
577b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    res[5] = _mm_add_epi16(res[5], k_const_rounding);
578b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    res[6] = _mm_add_epi16(res[6], k_const_rounding);
579b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    res[7] = _mm_add_epi16(res[7], k_const_rounding);
580b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  }
581b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
582b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[0] = _mm_sub_epi16(res[0], sign0);
583b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[1] = _mm_sub_epi16(res[1], sign1);
584b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[2] = _mm_sub_epi16(res[2], sign2);
585b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[3] = _mm_sub_epi16(res[3], sign3);
586b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[4] = _mm_sub_epi16(res[4], sign4);
587b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[5] = _mm_sub_epi16(res[5], sign5);
588b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[6] = _mm_sub_epi16(res[6], sign6);
589b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[7] = _mm_sub_epi16(res[7], sign7);
590b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
591b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[0] = _mm_srai_epi16(res[0], bit);
592b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[1] = _mm_srai_epi16(res[1], bit);
593b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[2] = _mm_srai_epi16(res[2], bit);
594b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[3] = _mm_srai_epi16(res[3], bit);
595b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[4] = _mm_srai_epi16(res[4], bit);
596b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[5] = _mm_srai_epi16(res[5], bit);
597b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[6] = _mm_srai_epi16(res[6], bit);
598b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[7] = _mm_srai_epi16(res[7], bit);
599b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}
600b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
601b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian// write 8x8 array
602b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res, int stride) {
603b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
604b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
605b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
606b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
607b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
608b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
609b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
610b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
611b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}
612b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
613b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian// perform in-place transpose
614b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) {
615b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
616b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
617b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
618b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
619b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
620b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
621b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
622b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
623b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 00 10 01 11 02 12 03 13
624b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 20 30 21 31 22 32 23 33
625b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 04 14 05 15 06 16 07 17
626b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 24 34 25 35 26 36 27 37
627b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 40 50 41 51 42 52 43 53
628b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 60 70 61 71 62 72 63 73
629b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 44 54 45 55 46 56 47 57
630b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 64 74 65 75 66 76 67 77
631b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
632b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
633b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
634b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
635b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
636b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
637b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
638b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
639b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 00 10 20 30 01 11 21 31
640b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 40 50 60 70 41 51 61 71
641b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 02 12 22 32 03 13 23 33
642b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 42 52 62 72 43 53 63 73
643b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 04 14 24 34 05 15 25 35
644b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 44 54 64 74 45 55 65 75
645b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 06 16 26 36 07 17 27 37
646b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 46 56 66 76 47 57 67 77
647b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
648b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
649b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
650b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
651b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
652b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
653b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
654b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
655b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 00 10 20 30 40 50 60 70
656b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 01 11 21 31 41 51 61 71
657b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 02 12 22 32 42 52 62 72
658b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 03 13 23 33 43 53 63 73
659b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 04 14 24 34 44 54 64 74
660b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 05 15 25 35 45 55 65 75
661b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 06 16 26 36 46 56 66 76
662b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // 07 17 27 37 47 57 67 77
663b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}
664b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
665b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid fdct8_avx2(__m128i *in) {
666b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // constants
667b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
668b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
669b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
670b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
671b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
672b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
673b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
674b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
675b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
676b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
677b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
678b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
679b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
680b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // stage 1
681b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  s0 = _mm_add_epi16(in[0], in[7]);
682b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  s1 = _mm_add_epi16(in[1], in[6]);
683b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  s2 = _mm_add_epi16(in[2], in[5]);
684b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  s3 = _mm_add_epi16(in[3], in[4]);
685b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  s4 = _mm_sub_epi16(in[3], in[4]);
686b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  s5 = _mm_sub_epi16(in[2], in[5]);
687b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  s6 = _mm_sub_epi16(in[1], in[6]);
688b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  s7 = _mm_sub_epi16(in[0], in[7]);
689b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
690b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u0 = _mm_add_epi16(s0, s3);
691b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u1 = _mm_add_epi16(s1, s2);
692b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u2 = _mm_sub_epi16(s1, s2);
693b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u3 = _mm_sub_epi16(s0, s3);
694b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  // interleave and perform butterfly multiplication/addition
695b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v0 = _mm_unpacklo_epi16(u0, u1);
696b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v1 = _mm_unpackhi_epi16(u0, u1);
697b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v2 = _mm_unpacklo_epi16(u2, u3);
698b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  v3 = _mm_unpackhi_epi16(u2, u3);
699b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
700b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);