/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org
#include <emmintrin.h>  // SSE2
#include "vp9/common/vp9_idct.h"  // for cospi constants
#include "vpx_ports/mem.h"

#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
16d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// Computes twice the sum of the 16 samples of a 4x4 block (the DC-only
// forward transform) and stores it at the start of output.
//
//   input  - top-left sample of the block; 4 int16 values are read from each
//            of 4 rows.
//   output - destination; must be 16-byte aligned because one aligned 128-bit
//            store is issued.
//   stride - distance between consecutive input rows, in int16 elements.
//
// The whole register is written, so output[0..7] are all overwritten: the
// 32-bit value 2 * sum lands in output[0] (low half) and output[1] (high
// half), output[2..3] and output[6..7] are zero, and output[4..5] hold a
// leftover partial sum (twice the sum of columns 1 and 3).
void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();

  // Each load fetches one row of four 16-bit samples into the low 64 bits.
  const __m128i row0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  const __m128i row1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  const __m128i row3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  // rows03 = [row0 | row3], rows12 = [row1 | row2]
  const __m128i rows03 = _mm_unpacklo_epi64(row0, row3);
  const __m128i rows12 = _mm_unpacklo_epi64(row1, row2);

  // sum16 = [row0 + row1 | row3 + row2], still 16 bits per lane.
  const __m128i sum16 = _mm_add_epi16(rows03, rows12);

  // Sign-extend to 32 bits: place each 16-bit value in the upper half of a
  // 32-bit lane, then arithmetic-shift it back down.
  const __m128i sum32_lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, sum16), 16);
  const __m128i sum32_hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, sum16), 16);

  // col = [c0 c1 c2 c3], the four column sums.
  const __m128i col = _mm_add_epi32(sum32_lo, sum32_hi);

  // cols01 = [c0 0 c1 0], cols23 = [c2 0 c3 0]
  const __m128i cols01 = _mm_unpacklo_epi32(col, zero);
  const __m128i cols23 = _mm_unpackhi_epi32(col, zero);

  // pairs = [c0+c2 0 c1+c3 0]
  const __m128i pairs = _mm_add_epi32(cols01, cols23);

  // Fold the upper 64 bits onto the lower ones: lane 0 becomes the full sum.
  const __m128i total = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 8));

  // Scale by 2 and store the whole register (aligned store).
  _mm_store_si128((__m128i *)(output), _mm_slli_epi32(total, 1));
}
4588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org
46ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgvoid vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
4793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // This 2D transform implements 4 vertical 1D transforms followed
4893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
4993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // by Chen, Smith and Fralick ('77).  The commands for moving the data
5093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // around have been minimized by hand.
5193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // For the purposes of the comments, the 16 inputs are referred to at i0
5293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // through iF (in raster order), intermediate variables are a0, b0, c0
5393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // through f, and correspond to the in-place computations mapped to input
5493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // locations.  The outputs, o0 through oF are labeled according to the
5593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // output locations.
5693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Constants
5893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // These are the coefficients used for the multiplies.
5993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
6093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // where cospi_N_64 = cos(N pi /64)
6193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
6293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64,
6393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64,
6493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64);
6593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
6693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64,
6793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64,
6893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64);
6993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
7093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_8_64, cospi_24_64,
7193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_24_64, -cospi_8_64,
7293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_24_64, -cospi_8_64);
7393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
7493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_24_64, -cospi_8_64,
7593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_8_64, cospi_24_64,
7693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_8_64, cospi_24_64);
7793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
7893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64,
7993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64,
8093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64);
8193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
8293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64,
8393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64,
8493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64);
8593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
8693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_8_64, cospi_24_64,
8793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            -cospi_8_64, -cospi_24_64,
8893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            -cospi_8_64, -cospi_24_64);
8993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
9093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_24_64, -cospi_8_64,
9193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            -cospi_24_64, cospi_8_64,
9293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            -cospi_24_64, cospi_8_64);
9393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
9593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // This second rounding constant saves doing some extra adds at the end
9693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
9793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                               +(DCT_CONST_ROUNDING << 1));
9893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const int DCT_CONST_BITS2 =  DCT_CONST_BITS+2;
993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
1003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
101d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  __m128i in0, in1;
10293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
1033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Load inputs.
1043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  {
1053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
10693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
10793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
10893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org           (input +  2 * stride)));
109d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
11093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org           (input +  3 * stride)));
11193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in0 = [i0 i1 i2 i3 iC iD iE iF]
11293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
113d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
11493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
11593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // multiply by 16 to give some extra precision
1163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in0 = _mm_slli_epi16(in0, 4);
1173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in1 = _mm_slli_epi16(in1, 4);
1183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // if (i == 0 && input[0]) input[0] += 1;
11993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // add 1 to the upper left pixel if it is non-zero, which helps reduce
12093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // the round-trip error
1213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    {
122411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org      // The mask will only contain whether the first value is zero, all
1233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // other comparison will fail as something shifted by 4 (above << 4)
1243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // can never be equal to one. To increment in the non-zero case, we
1253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // add the mask and one for the first element:
1263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      //   - if zero, mask = -1, v = v - 1 + 1 = v
1273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
1283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
1293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in0 = _mm_add_epi16(in0, mask);
1303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
1313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    }
1323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  }
13393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // There are 4 total stages, alternating between an add/subtract stage
13493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // followed by an multiply-and-add stage.
13593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  {
13693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Stage 1: Add/subtract
13793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
13893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in0 = [i0 i1 i2 i3 iC iD iE iF]
13993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
14093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
14193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
14293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
14393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // r1 = [iC i8 iD i9 iE iA iF iB]
14493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
14593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
14693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
14793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // r3 = [iC i8 iD i9 iF iB iE iA]
14893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
14993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i t0 = _mm_add_epi16(r2, r3);
15093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i t1 = _mm_sub_epi16(r2, r3);
15193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
15293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // t1 = [aC a8 aD a9 aF aB aE aA]
15393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
15493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Stage 2: multiply by constants (which gets us into 32 bits).
15593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // The constants needed here are:
15693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
15793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
15893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
15993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
16093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
16193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
16293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
16393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
16493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Then add and right-shift to get back to 16-bit range
1653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
16693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
16893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
17093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
17293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
17393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w0 = [b0 b1 b7 b6]
17493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w1 = [b8 b9 bF bE]
17593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w2 = [b4 b5 b3 b2]
17693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w3 = [bC bD bB bA]
17793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i x0 = _mm_packs_epi32(w0, w1);
17893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i x1 = _mm_packs_epi32(w2, w3);
17993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
18093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // x1 = [b4 b5 b3 b2 bC bD bB bA]
18193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in0 = _mm_shuffle_epi32(x0, 0xD8);
18293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in1 = _mm_shuffle_epi32(x1, 0x8D);
18393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
18493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in1 = [b3 b2 bB bA b4 b5 bC bD]
185d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  }
186d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  {
18793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // vertical DCTs finished. Now we do the horizontal DCTs.
18893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Stage 3: Add/subtract
18993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
19093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i t0 = _mm_add_epi16(in0, in1);
19193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i t1 = _mm_sub_epi16(in0, in1);
19293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
19393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
19493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
19593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Stage 4: multiply by constants (which gets us into 32 bits).
19693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // The constants needed here are:
19793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
19893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
19993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
20093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
20193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
20293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
20393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
20493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
20593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Then add and right-shift to get back to 16-bit range
20693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // but this combines the final right-shift as well to save operations
20793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // This unusual rounding operations is to maintain bit-accurate
20893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // compatibility with the c version of this function which has two
20993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // rounding steps in a row.
21093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
21193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
21293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
21393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
21493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
21593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
21693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
21793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
21893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w0 = [o0 o4 o8 oC]
21993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w1 = [o2 o6 oA oE]
22093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w2 = [o1 o5 o9 oD]
22193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w3 = [o3 o7 oB oF]
22293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // remember the o's are numbered according to the correct output location
22393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i x0 = _mm_packs_epi32(w0, w1);
22493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i x1 = _mm_packs_epi32(w2, w3);
22593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // x0 = [o0 o4 o8 oC o2 o6 oA oE]
22693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // x1 = [o1 o5 o9 oD o3 o7 oB oF]
22793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
22893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
22993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
23093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // y1 = [o2 o3 o6 o7 oA oB oE oF]
23193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in0 = _mm_unpacklo_epi32(y0, y1);
23293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
23393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in1 = _mm_unpackhi_epi32(y0, y1);
23493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in1 = [o8 o9 oA oB oC oD oE oF]
23593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  }
23693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // Post-condition (v + 1) >> 2 is now incorporated into previous
23793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // add and right-shift commands.  Only 2 store instructions needed
23893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // because we are using the fact that 1/3 are stored just after 0/2.
23993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  {
24093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org     _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
24193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org     _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
2423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  }
2433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org}
2443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org
24593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
246ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgstatic INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
247ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org                                   int stride) {
24847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
24947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
25047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i mask;
25147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
25247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
25347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
25447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
25547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
25647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
25747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_slli_epi16(in[0], 4);
25847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_slli_epi16(in[1], 4);
25947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = _mm_slli_epi16(in[2], 4);
26047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_slli_epi16(in[3], 4);
26147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
26247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
26347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_add_epi16(in[0], mask);
26447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
26547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
26647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
26747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
26847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i kOne = _mm_set1_epi16(1);
26947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
27047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
27147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i out01 = _mm_add_epi16(in01, kOne);
27247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i out23 = _mm_add_epi16(in23, kOne);
27347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  out01 = _mm_srai_epi16(out01, 2);
27447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  out23 = _mm_srai_epi16(out23, 2);
27547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
27647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
27747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
27847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
27947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void transpose_4x4(__m128i *res) {
28047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // Combine and transpose
28147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 00 01 02 03 20 21 22 23
28247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 10 11 12 13 30 31 32 33
28347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
28447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
28547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
28647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 00 10 01 11 02 12 03 13
28747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 20 30 21 31 22 32 23 33
28847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
28947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
29047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
29147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 00 10 20 30 01 11 21 31
29247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 02 12 22 32 03 13 23 33
29347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // only use the first 4 16-bit integers
29447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
29547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
29647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
29747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
298dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fdct4_sse2(__m128i *in) {
29947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
30047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
301ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
302ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
30347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
30447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
30547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i u[4], v[4];
306ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  u[0]=_mm_unpacklo_epi16(in[0], in[1]);
307ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  u[1]=_mm_unpacklo_epi16(in[3], in[2]);
308ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org
309ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  v[0] = _mm_add_epi16(u[0], u[1]);
310ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  v[1] = _mm_sub_epi16(u[0], u[1]);
31147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
31247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
31347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
314ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
315ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
31647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
31747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
31847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
31947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
32047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
32147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
32247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
32347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
32447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
32547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
32647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_packs_epi32(u[0], u[1]);
32747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_packs_epi32(u[2], u[3]);
32847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  transpose_4x4(in);
32947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
33047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
331dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fadst4_sse2(__m128i *in) {
33247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
33347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
33447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
33547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
33647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
33747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i kZero = _mm_set1_epi16(0);
33847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
33947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i u[8], v[8];
34047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in7 = _mm_add_epi16(in[0], in[1]);
34147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
34247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
34347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
34447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_unpacklo_epi16(in7, kZero);
34547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_unpacklo_epi16(in[2], kZero);
346d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  u[4] = _mm_unpacklo_epi16(in[3], kZero);
34747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
34847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
34947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
35047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
35147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
35247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
35347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
354d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
35547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
35647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(v[0], v[1]);
357d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  u[1] = _mm_sub_epi32(v[2], v[6]);
35847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(v[3], v[4]);
35947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_sub_epi32(u[2], u[0]);
36047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_slli_epi32(v[5], 2);
36147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_sub_epi32(u[4], v[5]);
36247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_add_epi32(u[3], u[5]);
36347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
36447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
36547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
36647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
36747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
36847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
36947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
37047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
37147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
37247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
37347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
37447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_packs_epi32(u[0], u[2]);
37547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_packs_epi32(u[1], u[3]);
37647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  transpose_4x4(in);
37747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
37847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
37976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
38076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                     int stride, int tx_type) {
38147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in[4];
38276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
38347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  switch (tx_type) {
38476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case DCT_DCT:
38576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      vp9_fdct4x4_sse2(input, output, stride);
38647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
38776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case ADST_DCT:
38876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_4x4(input, in, stride);
389dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst4_sse2(in);
390dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fdct4_sse2(in);
39176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_4x4(output, in);
39247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
39376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case DCT_ADST:
39476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_4x4(input, in, stride);
395dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fdct4_sse2(in);
396dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst4_sse2(in);
39776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_4x4(output, in);
39847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
39976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case ADST_ADST:
40076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_4x4(input, in, stride);
401dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst4_sse2(in);
402dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst4_sse2(in);
40376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_4x4(output, in);
40447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
40576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org   default:
40676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     assert(0);
40776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     break;
40847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  }
40947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
41047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
// DC-only 8x8 forward transform: sums all 64 input samples.
//   input  - 8 rows of 8 int16 samples, rows stride elements apart.  Each row
//            is fetched with an aligned 128-bit load, so every row address
//            must be 16-byte aligned.
//   output - output[0] receives the sum converted to int16_t and output[1]
//            is set to 0.  No other element is written and no alignment is
//            required.
//   stride - distance between consecutive input rows, in int16 elements.
// Column sums are accumulated in 16-bit lanes (wrapping on overflow) and
// only widened to 32 bits for the final horizontal reduction.
void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i col_sums = _mm_load_si128((const __m128i *)input);
  __m128i lo, hi, total;
  int r;

  // Per-column sums over the 8 rows, one column per 16-bit lane.
  for (r = 1; r < 8; ++r) {
    const __m128i row = _mm_load_si128((const __m128i *)(input + r * stride));
    col_sums = _mm_add_epi16(col_sums, row);
  }

  // Sign-extend the eight 16-bit sums to 32 bits: place each value in the
  // upper half of a 32-bit lane, then shift right arithmetically.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, col_sums), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, col_sums), 16);

  // Horizontal reduction; the grand total ends up in lane 0.
  total = _mm_add_epi32(lo, hi);
  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
  total = _mm_add_epi32(total, _mm_srli_si128(total, 4));

  // Store only the DC value.  A full-register store would also write the
  // upper half of the 32-bit sum into output[1] and leftover partial sums
  // into later elements.
  output[0] = (int16_t)_mm_cvtsi128_si32(total);
  output[1] = 0;
}
45088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org
451ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgvoid vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
4523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  int pass;
4533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Constants
4543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    When we use them, in one case, they are all the same. In all others
4553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    it's a pair of them that we need to repeat four times. This is done
4563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    by constructing the 32 bit constant corresponding to that pair.
4573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
4583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
4593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
4603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
4613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
4623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
4633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
4643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
4653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
4663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Load input
46747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
46847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
46947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
47047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
47147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
47247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
47347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
47447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
4753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Pre-condition input (shift by two)
4763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in0 = _mm_slli_epi16(in0, 2);
4773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in1 = _mm_slli_epi16(in1, 2);
4783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in2 = _mm_slli_epi16(in2, 2);
4793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in3 = _mm_slli_epi16(in3, 2);
4803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in4 = _mm_slli_epi16(in4, 2);
4813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in5 = _mm_slli_epi16(in5, 2);
4823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in6 = _mm_slli_epi16(in6, 2);
4833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in7 = _mm_slli_epi16(in7, 2);
4843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org
4853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // We do two passes, first the columns, then the rows. The results of the
4863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // first pass are transposed so that the same column code can be reused. The
4873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // results of the second pass are also transposed so that the rows (processed
4883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // as columns) are put back in row positions.
4893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  for (pass = 0; pass < 2; pass++) {
4903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // To store results of each pass before the transpose.
4913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
492411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // Add/subtract
4933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q0 = _mm_add_epi16(in0, in7);
4943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q1 = _mm_add_epi16(in1, in6);
4953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q2 = _mm_add_epi16(in2, in5);
4963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q3 = _mm_add_epi16(in3, in4);
4973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q4 = _mm_sub_epi16(in3, in4);
4983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q5 = _mm_sub_epi16(in2, in5);
4993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q6 = _mm_sub_epi16(in1, in6);
5003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q7 = _mm_sub_epi16(in0, in7);
5013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // Work on first four results
5023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    {
503411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org      // Add/subtract
5043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r0 = _mm_add_epi16(q0, q3);
5053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r1 = _mm_add_epi16(q1, q2);
5063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r2 = _mm_sub_epi16(q1, q2);
5073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r3 = _mm_sub_epi16(q0, q3);
5083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Interleave to do the multiply by constants which gets us into 32bits
5093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
5103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
5113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
5123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
5133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
5143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
5153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
5163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
5173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
5183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
5193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
5203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
5213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // dct_const_round_shift
5223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
5233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
5243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
5253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
5263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
5273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
5283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
5293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
5303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
5313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
5323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
5333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
5343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
5353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
5363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
5373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
5383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Combine
5393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res0 = _mm_packs_epi32(w0, w1);
5403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res4 = _mm_packs_epi32(w2, w3);
5413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res2 = _mm_packs_epi32(w4, w5);
5423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res6 = _mm_packs_epi32(w6, w7);
5433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    }
5443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // Work on next four results
5453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    {
5463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Interleave to do the multiply by constants which gets us into 32bits
5473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
5483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
5493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
5503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
5513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
5523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
5533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // dct_const_round_shift
5543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
5553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
5563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
5573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
5583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
5593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
5603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
5613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
5623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Combine
5633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r0 = _mm_packs_epi32(s0, s1);
5643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r1 = _mm_packs_epi32(s2, s3);
565411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org      // Add/subtract
5663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i x0 = _mm_add_epi16(q4, r0);
5673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i x1 = _mm_sub_epi16(q4, r0);
5683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i x2 = _mm_sub_epi16(q7, r1);
5693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i x3 = _mm_add_epi16(q7, r1);
5703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Interleave to do the multiply by constants which gets us into 32bits
5713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
5723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
5733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
5743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
5753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
5763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
5773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
5783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
5793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
5803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
5813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
5823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
5833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // dct_const_round_shift
5843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
5853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
5863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
5873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
5883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
5893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
5903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
5913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
5923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
5933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
5943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
5953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
5963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
5973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
5983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
5993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
6003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Combine
6013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res1 = _mm_packs_epi32(w0, w1);
6023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res7 = _mm_packs_epi32(w2, w3);
6033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res5 = _mm_packs_epi32(w4, w5);
6043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res3 = _mm_packs_epi32(w6, w7);
6053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    }
6063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // Transpose the 8x8.
6073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    {
6083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 00 01 02 03 04 05 06 07
6093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 10 11 12 13 14 15 16 17
6103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 20 21 22 23 24 25 26 27
6113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 30 31 32 33 34 35 36 37
6123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 40 41 42 43 44 45 46 47
6133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 50 51 52 53 54 55 56 57
6143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 60 61 62 63 64 65 66 67
6153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 70 71 72 73 74 75 76 77
6163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
6173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
6183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
6193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
6203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
6213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
6223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
6233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
6243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 00 10 01 11 02 12 03 13
6253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 20 30 21 31 22 32 23 33
6263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 04 14 05 15 06 16 07 17
6273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 24 34 25 35 26 36 27 37
6283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 40 50 41 51 42 52 43 53
6293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 60 70 61 71 62 72 63 73
6303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 44 54 45 55 46 56 47 57
6313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 64 74 65 75 66 76 67 77
6323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
6333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
6343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
6353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
6363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
6373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
6383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
6393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
6403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 00 10 20 30 01 11 21 31
6413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 40 50 60 70 41 51 61 71
6423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 02 12 22 32 03 13 23 33
6433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 42 52 62 72 43 53 63 73
6443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 04 14 24 34 05 15 25 35
6453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 44 54 64 74 45 55 65 75
6463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 06 16 26 36 07 17 27 37
6473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 46 56 66 76 47 57 67 77
6483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
6493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
6503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
6513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
6523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
6533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
6543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
6553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
6563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 00 10 20 30 40 50 60 70
6573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 01 11 21 31 41 51 61 71
6583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 02 12 22 32 42 52 62 72
6593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 03 13 23 33 43 53 63 73
6603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 04 14 24 34 44 54 64 74
6613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 05 15 25 35 45 55 65 75
6623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 06 16 26 36 46 56 66 76
6633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 07 17 27 37 47 57 67 77
6643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    }
6653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  }
6663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Post-condition output and store it
6673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  {
6683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // Post-condition (division by two)
6693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    //    division of two 16 bits signed numbers using shifts
6703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    //    n / 2 = (n - (n >> 15)) >> 1
6713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
6723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
6733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
6743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
6753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
6763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
6773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
6783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
6793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in0 = _mm_sub_epi16(in0, sign_in0);
6803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in1 = _mm_sub_epi16(in1, sign_in1);
6813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in2 = _mm_sub_epi16(in2, sign_in2);
6823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in3 = _mm_sub_epi16(in3, sign_in3);
6833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in4 = _mm_sub_epi16(in4, sign_in4);
6843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in5 = _mm_sub_epi16(in5, sign_in5);
6853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in6 = _mm_sub_epi16(in6, sign_in6);
6863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in7 = _mm_sub_epi16(in7, sign_in7);
6873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in0 = _mm_srai_epi16(in0, 1);
6883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in1 = _mm_srai_epi16(in1, 1);
6893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in2 = _mm_srai_epi16(in2, 1);
6903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in3 = _mm_srai_epi16(in3, 1);
6913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in4 = _mm_srai_epi16(in4, 1);
6923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in5 = _mm_srai_epi16(in5, 1);
6933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in6 = _mm_srai_epi16(in6, 1);
6943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in7 = _mm_srai_epi16(in7, 1);
6953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // store results
69647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
69747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
69847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
69947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
70047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
70147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
70247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
70347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
70447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  }
70547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
70647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
70747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org// load 8x8 array
708ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgstatic INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
709ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org                                   int stride) {
710ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
711ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
712ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
713ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
714ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
715ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
716ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
717ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
71847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
71947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_slli_epi16(in[0], 2);
72047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_slli_epi16(in[1], 2);
72147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = _mm_slli_epi16(in[2], 2);
72247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_slli_epi16(in[3], 2);
72347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[4] = _mm_slli_epi16(in[4], 2);
72447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[5] = _mm_slli_epi16(in[5], 2);
72547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[6] = _mm_slli_epi16(in[6], 2);
72647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[7] = _mm_slli_epi16(in[7], 2);
72747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
72847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
72947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org// right shift and rounding
73047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void right_shift_8x8(__m128i *res, int const bit) {
73147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i kOne = _mm_set1_epi16(1);
73247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const int bit_m02 = bit - 2;
73347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign0 = _mm_srai_epi16(res[0], 15);
73447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign1 = _mm_srai_epi16(res[1], 15);
73547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign2 = _mm_srai_epi16(res[2], 15);
73647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign3 = _mm_srai_epi16(res[3], 15);
73747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign4 = _mm_srai_epi16(res[4], 15);
73847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign5 = _mm_srai_epi16(res[5], 15);
73947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign6 = _mm_srai_epi16(res[6], 15);
74047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign7 = _mm_srai_epi16(res[7], 15);
74147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
74247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  if (bit_m02 >= 0) {
74347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
74447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[0] = _mm_add_epi16(res[0], k_const_rounding);
74547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[1] = _mm_add_epi16(res[1], k_const_rounding);
74647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[2] = _mm_add_epi16(res[2], k_const_rounding);
74747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[3] = _mm_add_epi16(res[3], k_const_rounding);
74847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[4] = _mm_add_epi16(res[4], k_const_rounding);
74947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[5] = _mm_add_epi16(res[5], k_const_rounding);
75047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[6] = _mm_add_epi16(res[6], k_const_rounding);
75147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[7] = _mm_add_epi16(res[7], k_const_rounding);
75247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  }
75347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
75447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[0] = _mm_sub_epi16(res[0], sign0);
75547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[1] = _mm_sub_epi16(res[1], sign1);
75647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[2] = _mm_sub_epi16(res[2], sign2);
75747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[3] = _mm_sub_epi16(res[3], sign3);
75847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[4] = _mm_sub_epi16(res[4], sign4);
75947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[5] = _mm_sub_epi16(res[5], sign5);
76047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[6] = _mm_sub_epi16(res[6], sign6);
76147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[7] = _mm_sub_epi16(res[7], sign7);
76247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
76347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[0] = _mm_srai_epi16(res[0], bit);
76447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[1] = _mm_srai_epi16(res[1], bit);
76547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[2] = _mm_srai_epi16(res[2], bit);
76647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[3] = _mm_srai_epi16(res[3], bit);
76747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[4] = _mm_srai_epi16(res[4], bit);
76847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[5] = _mm_srai_epi16(res[5], bit);
76947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[6] = _mm_srai_epi16(res[6], bit);
77047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[7] = _mm_srai_epi16(res[7], bit);
77147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
77247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
77347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org// write 8x8 array
77447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
77547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
77647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
77747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
77847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
77947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
78047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
78147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
78247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
78347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
78447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
785dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fdct8_sse2(__m128i *in) {
78647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // constants
78747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
78847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
78947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
79047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
79147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
79247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
79347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
79447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
79547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
79647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
79747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
79847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
79947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
80047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 1
80147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s0 = _mm_add_epi16(in[0], in[7]);
80247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s1 = _mm_add_epi16(in[1], in[6]);
80347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s2 = _mm_add_epi16(in[2], in[5]);
80447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s3 = _mm_add_epi16(in[3], in[4]);
80547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s4 = _mm_sub_epi16(in[3], in[4]);
80647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s5 = _mm_sub_epi16(in[2], in[5]);
80747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s6 = _mm_sub_epi16(in[1], in[6]);
80847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s7 = _mm_sub_epi16(in[0], in[7]);
80947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
81047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_add_epi16(s0, s3);
81147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_add_epi16(s1, s2);
81247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_sub_epi16(s1, s2);
81347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_sub_epi16(s0, s3);
81447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // interleave and perform butterfly multiplication/addition
81547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_unpacklo_epi16(u0, u1);
81647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_unpackhi_epi16(u0, u1);
81747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_unpacklo_epi16(u2, u3);
81847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_unpackhi_epi16(u2, u3);
81947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
82047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
82147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
82247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
82347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
82447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
82547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
82647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
82747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
82847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
82947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // shift and rounding
83047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
83147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
83247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
83347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
83447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
83547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
83647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
83747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
83847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
83947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
84047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
84147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
84247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
84347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
84447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
84547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
84647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
84747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
84847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_packs_epi32(u0, u1);
84947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = _mm_packs_epi32(u4, u5);
85047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[4] = _mm_packs_epi32(u2, u3);
85147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[6] = _mm_packs_epi32(u6, u7);
85247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
85347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 2
85447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // interleave and perform butterfly multiplication/addition
85547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_unpacklo_epi16(s6, s5);
85647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_unpackhi_epi16(s6, s5);
85747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
85847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
85947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
86047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
86147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
86247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // shift and rounding
86347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
86447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
86547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
86647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
86747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
86847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
86947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
87047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
87147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
87247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
87347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_packs_epi32(v0, v1);
87447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_packs_epi32(v2, v3);
87547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
87647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 3
87747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s0 = _mm_add_epi16(s4, u0);
87847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s1 = _mm_sub_epi16(s4, u0);
87947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s2 = _mm_sub_epi16(s7, u1);
88047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s3 = _mm_add_epi16(s7, u1);
88147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
88247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 4
88347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_unpacklo_epi16(s0, s3);
88447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_unpackhi_epi16(s0, s3);
88547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_unpacklo_epi16(s1, s2);
88647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_unpackhi_epi16(s1, s2);
88747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
88847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
88947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
89047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
89147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
89247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
89347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
89447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
89547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
89647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
89747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // shift and rounding
89847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
89947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
90047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
90147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
90247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
90347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
90447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
90547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
90647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
90747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
90847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
90947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
91047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
91147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
91247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
91347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
91447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
91547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
91647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_packs_epi32(v0, v1);
91747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_packs_epi32(v4, v5);
91847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[5] = _mm_packs_epi32(v2, v3);
91947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[7] = _mm_packs_epi32(v6, v7);
92047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
92147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // transpose
92247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  array_transpose_8x8(in, in);
92347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
92447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
925dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fadst8_sse2(__m128i *in) {
92647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // Constants
92747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
92847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
92947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
93047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
93147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
93247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
93347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
93447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
93547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
93647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
93747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
93847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
93947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
94047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__const_0 = _mm_set1_epi16(0);
94147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
94247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
94347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
94447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
94547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
94647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
94747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
94847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
94947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // properly aligned for butterfly input
95047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in0  = in[7];
95147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in1  = in[0];
95247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in2  = in[5];
95347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in3  = in[2];
95447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in4  = in[3];
95547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in5  = in[4];
95647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in6  = in[1];
95747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in7  = in[6];
95847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
95947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // column transformation
96047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 1
96147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // interleave and multiply/add into 32-bit integer
96247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s0 = _mm_unpacklo_epi16(in0, in1);
96347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s1 = _mm_unpackhi_epi16(in0, in1);
96447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s2 = _mm_unpacklo_epi16(in2, in3);
96547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s3 = _mm_unpackhi_epi16(in2, in3);
96647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s4 = _mm_unpacklo_epi16(in4, in5);
96747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s5 = _mm_unpackhi_epi16(in4, in5);
96847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s6 = _mm_unpacklo_epi16(in6, in7);
96947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s7 = _mm_unpackhi_epi16(in6, in7);
97047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
97147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
97247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
97347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
97447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
97547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
97647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
97747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
97847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
97947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
98047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
98147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
98247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
98347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
98447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
98547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
98647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
98747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
98847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // addition
98947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w0 = _mm_add_epi32(u0, u8);
99047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w1 = _mm_add_epi32(u1, u9);
99147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w2 = _mm_add_epi32(u2, u10);
99247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w3 = _mm_add_epi32(u3, u11);
99347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w4 = _mm_add_epi32(u4, u12);
99447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w5 = _mm_add_epi32(u5, u13);
99547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w6 = _mm_add_epi32(u6, u14);
99647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w7 = _mm_add_epi32(u7, u15);
99747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w8 = _mm_sub_epi32(u0, u8);
99847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w9 = _mm_sub_epi32(u1, u9);
99947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w10 = _mm_sub_epi32(u2, u10);
100047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w11 = _mm_sub_epi32(u3, u11);
100147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w12 = _mm_sub_epi32(u4, u12);
100247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w13 = _mm_sub_epi32(u5, u13);
100347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w14 = _mm_sub_epi32(u6, u14);
100447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w15 = _mm_sub_epi32(u7, u15);
100547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
100647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // shift and rounding
100747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
100847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
100947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
101047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
101147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
101247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
101347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
101447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
101547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
101647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
101747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
101847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
101947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
102047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
102147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
102247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
102347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
102447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
102547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
102647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
102747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
102847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
102947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
103047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
103147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
103247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
103347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
103447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
103547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
103647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
103747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
103847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
103947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
104047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
104147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // back to 16-bit and pack 8 integers into __m128i
104247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_packs_epi32(u0, u1);
104347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_packs_epi32(u2, u3);
104447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = _mm_packs_epi32(u4, u5);
104547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_packs_epi32(u6, u7);
104647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[4] = _mm_packs_epi32(u8, u9);
104747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[5] = _mm_packs_epi32(u10, u11);
104847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[6] = _mm_packs_epi32(u12, u13);
104947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[7] = _mm_packs_epi32(u14, u15);
105047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
105147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 2
105247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s0 = _mm_add_epi16(in[0], in[2]);
105347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s1 = _mm_add_epi16(in[1], in[3]);
105447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s2 = _mm_sub_epi16(in[0], in[2]);
105547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s3 = _mm_sub_epi16(in[1], in[3]);
105647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_unpacklo_epi16(in[4], in[5]);
105747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_unpackhi_epi16(in[4], in[5]);
105847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_unpacklo_epi16(in[6], in[7]);
105947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_unpackhi_epi16(in[6], in[7]);
106047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
106147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
106247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
106347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
106447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
106547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
106647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
106747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
106847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
106947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
107047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w0 = _mm_add_epi32(v0, v4);
107147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w1 = _mm_add_epi32(v1, v5);
107247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w2 = _mm_add_epi32(v2, v6);
107347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w3 = _mm_add_epi32(v3, v7);
107447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w4 = _mm_sub_epi32(v0, v4);
107547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w5 = _mm_sub_epi32(v1, v5);
107647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w6 = _mm_sub_epi32(v2, v6);
107747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w7 = _mm_sub_epi32(v3, v7);
107847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
107947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
108047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
108147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
108247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
108347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
108447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
108547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
108647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
108747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
108847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
108947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
109047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
109147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
109247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
109347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
109447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
109547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
109647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
109747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // back to 16-bit intergers
109847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s4 = _mm_packs_epi32(u0, u1);
109947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s5 = _mm_packs_epi32(u2, u3);
110047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s6 = _mm_packs_epi32(u4, u5);
110147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s7 = _mm_packs_epi32(u6, u7);
110247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
110347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 3
110447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_unpacklo_epi16(s2, s3);
110547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_unpackhi_epi16(s2, s3);
110647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_unpacklo_epi16(s6, s7);
110747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_unpackhi_epi16(s6, s7);
110847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
110947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
111047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
111147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
111247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
111347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
111447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
111547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
111647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
111747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
111847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
111947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
112047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
112147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
112247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
112347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
112447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
112547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
112647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
112747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
112847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
112947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
113047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
113147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
113247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
113347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
113447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
113547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
113647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s2 = _mm_packs_epi32(v0, v1);
113747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s3 = _mm_packs_epi32(v2, v3);
113847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s6 = _mm_packs_epi32(v4, v5);
113947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s7 = _mm_packs_epi32(v6, v7);
114047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
114147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // FIXME(jingning): do subtract using bit inversion?
114247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = s0;
114347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_sub_epi16(k__const_0, s4);
114447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = s6;
114547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_sub_epi16(k__const_0, s2);
114647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[4] = s3;
114747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[5] = _mm_sub_epi16(k__const_0, s7);
114847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[6] = s5;
114947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[7] = _mm_sub_epi16(k__const_0, s1);
115047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
115147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // transpose
115247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  array_transpose_8x8(in, in);
115347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
115447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
115576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
115676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                     int stride, int tx_type) {
115747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in[8];
115876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
115947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  switch (tx_type) {
116076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case DCT_DCT:
116176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      vp9_fdct8x8_sse2(input, output, stride);
116247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
116376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case ADST_DCT:
116476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_8x8(input, in, stride);
1165dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst8_sse2(in);
1166dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fdct8_sse2(in);
116776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      right_shift_8x8(in, 1);
116876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_8x8(output, in, 8);
116947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
117076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case DCT_ADST:
117176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_8x8(input, in, stride);
1172dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fdct8_sse2(in);
1173dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst8_sse2(in);
117476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      right_shift_8x8(in, 1);
117576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_8x8(output, in, 8);
117647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
117776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case ADST_ADST:
117876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_8x8(input, in, stride);
1179dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst8_sse2(in);
1180dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst8_sse2(in);
118176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      right_shift_8x8(in, 1);
118276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_8x8(output, in, 8);
118347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
118447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    default:
118547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      assert(0);
118647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
11873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  }
11883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org}
11893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org
// DC-only 16x16 forward transform (SSE2): adds up all 256 samples of the
// block and stores half of that sum as the DC coefficient.
//
//   input  - 16x16 block of int16_t samples whose rows are `stride` elements
//            apart. Rows are fetched with aligned 128-bit loads, so `input`
//            and `input + 8` plus any multiple of `stride` must be 16-byte
//            aligned.
//   output - output[0] receives (sum >> 1) and output[1] is cleared; no other
//            element is written and no alignment is required.
//   stride - distance between consecutive rows of `input`, in elements.
//
// The previous implementation stored the whole 128-bit reduction vector, which
// put the upper half of the 32-bit DC (-1 for a negative DC) in output[1] and
// a leftover partial sum in output[4..5].
//
// NOTE(review): each 16-bit lane accumulates 32 samples with wrapping adds, so
// the result is only exact while those partial sums fit in int16_t --
// presumably guaranteed by the residual range; confirm against the callers.
void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum16 = zero;
  __m128i lo32, hi32, sum32;
  int half, row;

  // The block is handled as two 8-column halves of 16 rows each.
  for (half = 0; half < 2; ++half) {
    const int16_t *const col = input + 8 * half;

    for (row = 0; row < 16; row += 4) {
      const __m128i r0 =
          _mm_load_si128((const __m128i *)(col + (row + 0) * stride));
      const __m128i r1 =
          _mm_load_si128((const __m128i *)(col + (row + 1) * stride));
      const __m128i r2 =
          _mm_load_si128((const __m128i *)(col + (row + 2) * stride));
      const __m128i r3 =
          _mm_load_si128((const __m128i *)(col + (row + 3) * stride));

      // Pairwise adds keep the dependency chain on sum16 short.
      sum16 = _mm_add_epi16(
          sum16, _mm_add_epi16(_mm_add_epi16(r0, r1), _mm_add_epi16(r2, r3)));
    }
  }

  // Sign-extend the eight 16-bit lane sums to 32 bits: place each one in the
  // upper half of a 32-bit lane, then shift back down arithmetically.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, sum16), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, sum16), 16);

  // Horizontal reduction; lane 0 ends up holding the sum of all lanes.
  sum32 = _mm_add_epi32(lo32, hi32);
  sum32 = _mm_add_epi32(sum32, _mm_srli_si128(sum32, 8));
  sum32 = _mm_add_epi32(sum32, _mm_srli_si128(sum32, 4));

  // Halve with an arithmetic shift so negative sums round toward -infinity.
  sum32 = _mm_srai_epi32(sum32, 1);

  // Write only the DC slot (plus a cleared output[1]) instead of 16 bytes.
  output[0] = (int16_t)_mm_cvtsi128_si32(sum32);
  output[1] = 0;
}
125788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org
1258ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgvoid vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
12593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // The 2D transform is done with two passes which are actually pretty
12603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // similar. In the first one, we transform the columns and transpose
12613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // the results. In the second one, we transform the rows. To achieve that,
1262411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // as the first pass results are transposed, we transform the columns (that
12633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // is the transposed rows) and transpose the results (so that it goes back
12643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // in normal/row positions).
12653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  int pass;
12663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // We need an intermediate buffer between passes.
126747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
1268ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  const int16_t *in = input;
12693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  int16_t *out = intermediate;
12703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Constants
12713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    When we use them, in one case, they are all the same. In all others
12723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    it's a pair of them that we need to repeat four times. This is done
12733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    by constructing the 32 bit constant corresponding to that pair.
12743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
12753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
12763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
127788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
12783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
12793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
12803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
12813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
12823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
12833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
12843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
12853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
12863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
12873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
12883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
12893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
12903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
12913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
12923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i kOne = _mm_set1_epi16(1);
12933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Do the two transform/transpose passes
12943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  for (pass = 0; pass < 2; ++pass) {
12953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // We process eight columns (transposed rows in second pass) at a time.
12963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    int column_start;
12973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    for (column_start = 0; column_start < 16; column_start += 8) {
12983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
12993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
13003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
13013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i step1_0, step1_1, step1_2, step1_3;
13023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i step1_4, step1_5, step1_6, step1_7;
13033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
13043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i step3_0, step3_1, step3_2, step3_3;
13053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i step3_4, step3_5, step3_6, step3_7;
13063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
13073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
13083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Load and pre-condition input.
13093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      if (0 == pass) {
131047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
131147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
131247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
131347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
131447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
131547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
131647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
131747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
131847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
131947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
132047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
132147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
132247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
132347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
132447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
132547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
13263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // x = x << 2
13273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in00 = _mm_slli_epi16(in00, 2);
13283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in01 = _mm_slli_epi16(in01, 2);
13293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in02 = _mm_slli_epi16(in02, 2);
13303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in03 = _mm_slli_epi16(in03, 2);
13313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in04 = _mm_slli_epi16(in04, 2);
13323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in05 = _mm_slli_epi16(in05, 2);
13333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in06 = _mm_slli_epi16(in06, 2);
13343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in07 = _mm_slli_epi16(in07, 2);
13353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in08 = _mm_slli_epi16(in08, 2);
13363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in09 = _mm_slli_epi16(in09, 2);
13373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in10 = _mm_slli_epi16(in10, 2);
13383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in11 = _mm_slli_epi16(in11, 2);
13393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in12 = _mm_slli_epi16(in12, 2);
13403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in13 = _mm_slli_epi16(in13, 2);
13413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in14 = _mm_slli_epi16(in14, 2);
13423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in15 = _mm_slli_epi16(in15, 2);
13433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      } else {
134447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
134547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
134647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
134747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
134847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
134947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
135047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
135147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
135247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
135347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
135447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
135547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
135647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
135747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
135847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
135947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
13603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // x = (x + 1) >> 2
13613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in00 = _mm_add_epi16(in00, kOne);
13623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in01 = _mm_add_epi16(in01, kOne);
13633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in02 = _mm_add_epi16(in02, kOne);
13643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in03 = _mm_add_epi16(in03, kOne);
13653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in04 = _mm_add_epi16(in04, kOne);
13663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in05 = _mm_add_epi16(in05, kOne);
13673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in06 = _mm_add_epi16(in06, kOne);
13683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in07 = _mm_add_epi16(in07, kOne);
13693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in08 = _mm_add_epi16(in08, kOne);
13703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in09 = _mm_add_epi16(in09, kOne);
13713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in10 = _mm_add_epi16(in10, kOne);
13723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in11 = _mm_add_epi16(in11, kOne);
13733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in12 = _mm_add_epi16(in12, kOne);
13743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in13 = _mm_add_epi16(in13, kOne);
13753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in14 = _mm_add_epi16(in14, kOne);
13763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in15 = _mm_add_epi16(in15, kOne);
13773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in00 = _mm_srai_epi16(in00, 2);
13783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in01 = _mm_srai_epi16(in01, 2);
13793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in02 = _mm_srai_epi16(in02, 2);
13803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in03 = _mm_srai_epi16(in03, 2);
13813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in04 = _mm_srai_epi16(in04, 2);
13823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in05 = _mm_srai_epi16(in05, 2);
13833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in06 = _mm_srai_epi16(in06, 2);
13843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in07 = _mm_srai_epi16(in07, 2);
13853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in08 = _mm_srai_epi16(in08, 2);
13863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in09 = _mm_srai_epi16(in09, 2);
13873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in10 = _mm_srai_epi16(in10, 2);
13883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in11 = _mm_srai_epi16(in11, 2);
13893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in12 = _mm_srai_epi16(in12, 2);
13903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in13 = _mm_srai_epi16(in13, 2);
13913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in14 = _mm_srai_epi16(in14, 2);
13923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in15 = _mm_srai_epi16(in15, 2);
13933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
13943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in += 8;
13953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Calculate input for the first 8 results.
13963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      {
13973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input0 = _mm_add_epi16(in00, in15);
13983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input1 = _mm_add_epi16(in01, in14);
13993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input2 = _mm_add_epi16(in02, in13);
14003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input3 = _mm_add_epi16(in03, in12);
14013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input4 = _mm_add_epi16(in04, in11);
14023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input5 = _mm_add_epi16(in05, in10);
14033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input6 = _mm_add_epi16(in06, in09);
14043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input7 = _mm_add_epi16(in07, in08);
14053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
14063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Calculate input for the next 8 results.
14073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      {
14083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_0 = _mm_sub_epi16(in07, in08);
14093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_1 = _mm_sub_epi16(in06, in09);
14103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_2 = _mm_sub_epi16(in05, in10);
14113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_3 = _mm_sub_epi16(in04, in11);
14123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_4 = _mm_sub_epi16(in03, in12);
14133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_5 = _mm_sub_epi16(in02, in13);
14143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_6 = _mm_sub_epi16(in01, in14);
14153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_7 = _mm_sub_epi16(in00, in15);
14163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
1417dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      // Work on the first eight values; fdct8(input, even_results);
14183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      {
1419411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org        // Add/subtract
14203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q0 = _mm_add_epi16(input0, input7);
14213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q1 = _mm_add_epi16(input1, input6);
14223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q2 = _mm_add_epi16(input2, input5);
14233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q3 = _mm_add_epi16(input3, input4);
14243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q4 = _mm_sub_epi16(input3, input4);
14253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q5 = _mm_sub_epi16(input2, input5);
14263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q6 = _mm_sub_epi16(input1, input6);
14273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q7 = _mm_sub_epi16(input0, input7);
14283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // Work on first four results
14293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
1430411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org          // Add/subtract
14313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r0 = _mm_add_epi16(q0, q3);
14323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r1 = _mm_add_epi16(q1, q2);
14333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r2 = _mm_sub_epi16(q1, q2);
14343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r3 = _mm_sub_epi16(q0, q3);
14353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Interleave to do the multiply by constants which gets us
14363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // into 32 bits.
14373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
14383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
14393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
14403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
14413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
14423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
14433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
14443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
14453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
14463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
14473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
14483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
14493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
14503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
14513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
14523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
14533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
14543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
14553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
14563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
14573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
14583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
14593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
14603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
14613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
14623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
14633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
14643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
14653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
14663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
14673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res00 = _mm_packs_epi32(w0, w1);
14683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res08 = _mm_packs_epi32(w2, w3);
14693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res04 = _mm_packs_epi32(w4, w5);
14703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res12 = _mm_packs_epi32(w6, w7);
14713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
14723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // Work on next four results
14733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
14743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Interleave to do the multiply by constants which gets us
14753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // into 32 bits.
14763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
14773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
14783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
14793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
14803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
14813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
14823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
14833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
14843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
14853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
14863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
14873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
14883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
14893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
14903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
14913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
14923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r0 = _mm_packs_epi32(s0, s1);
14933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r1 = _mm_packs_epi32(s2, s3);
1494411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org          // Add/subtract
14953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i x0 = _mm_add_epi16(q4, r0);
14963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i x1 = _mm_sub_epi16(q4, r0);
14973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i x2 = _mm_sub_epi16(q7, r1);
14983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i x3 = _mm_add_epi16(q7, r1);
14993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Interleave to do the multiply by constants which gets us
15003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // into 32 bits.
15013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
15023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
15033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
15043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
15053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
15063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
15073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
15083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
15093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
15103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
15113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
15123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
15133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
15143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
15153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
15163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
15173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
15183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
15193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
15203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
15213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
15223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
15233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
15243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
15253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
15263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
15273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
15283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
15293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
15303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
15313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res02 = _mm_packs_epi32(w0, w1);
15323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res14 = _mm_packs_epi32(w2, w3);
15333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res10 = _mm_packs_epi32(w4, w5);
15343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res06 = _mm_packs_epi32(w6, w7);
15353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
15363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
15373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Work on the next eight values; step1 -> odd_results
15383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      {
15393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // step 2
15403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
15413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
15423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
15433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
15443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
15453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
15463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
15473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
15483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
15493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
15503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
15513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
15523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
15533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
15543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
15553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
15563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
15573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
15583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
15593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_2 = _mm_packs_epi32(w0, w1);
15603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_3 = _mm_packs_epi32(w2, w3);
15613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
15623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
15633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
15643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
15653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
15663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
15673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
15683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
15693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
15703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
15713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
15723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
15733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
15743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
15753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
15763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
15773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
15783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
15793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
15803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
15813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_5 = _mm_packs_epi32(w0, w1);
15823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_4 = _mm_packs_epi32(w2, w3);
15833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
15843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // step 3
15853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
15863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_0 = _mm_add_epi16(step1_0, step2_3);
15873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_1 = _mm_add_epi16(step1_1, step2_2);
15883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_2 = _mm_sub_epi16(step1_1, step2_2);
15893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_3 = _mm_sub_epi16(step1_0, step2_3);
15903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_4 = _mm_sub_epi16(step1_7, step2_4);
15913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_5 = _mm_sub_epi16(step1_6, step2_5);
15923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_6 = _mm_add_epi16(step1_6, step2_5);
15933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_7 = _mm_add_epi16(step1_7, step2_4);
15943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
15953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // step 4
15963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
15973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
15983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
15993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
16003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
16013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
16023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
160388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
160488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
16053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
16063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
16073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
16083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
16093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
16103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
16113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
16123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
16133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
16143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
16153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_1 = _mm_packs_epi32(w0, w1);
16163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_2 = _mm_packs_epi32(w2, w3);
16173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
16183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
16193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
16203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
16213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
16223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
16233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
16243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
162588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
162688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
16273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
16283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
16293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
16303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
16313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
16323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
16333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
16343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
16353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
16363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
16373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_6 = _mm_packs_epi32(w0, w1);
16383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_5 = _mm_packs_epi32(w2, w3);
16393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
16403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // step 5
16413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
16423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step1_0 = _mm_add_epi16(step3_0, step2_1);
16433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step1_1 = _mm_sub_epi16(step3_0, step2_1);
164488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          step1_2 = _mm_add_epi16(step3_3, step2_2);
164588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          step1_3 = _mm_sub_epi16(step3_3, step2_2);
164688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          step1_4 = _mm_sub_epi16(step3_4, step2_5);
164788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          step1_5 = _mm_add_epi16(step3_4, step2_5);
16483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step1_6 = _mm_sub_epi16(step3_7, step2_6);
16493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step1_7 = _mm_add_epi16(step3_7, step2_6);
16503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
16513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // step 6
16523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
16533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
16543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
16553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
16563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
16573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
16583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
16593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
16603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
16613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
16623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
16633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
16643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
16653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
16663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
16673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
16683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
16693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
16703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
16713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res01 = _mm_packs_epi32(w0, w1);
16723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res09 = _mm_packs_epi32(w2, w3);
16733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
16743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
16753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
16763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
16773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
16783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
16793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
16803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
16813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
16823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
16833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
16843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
16853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
16863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
16873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
16883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
16893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
16903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
16913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
16923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
16933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res05 = _mm_packs_epi32(w0, w1);
16943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res13 = _mm_packs_epi32(w2, w3);
16953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
16963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
16973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
16983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
16993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
17003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
17013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
17023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
17033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
17043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
17053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
17063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
17073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
17083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
17093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
17103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
17113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
17123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
17133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
17143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
17153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res11 = _mm_packs_epi32(w0, w1);
17163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res03 = _mm_packs_epi32(w2, w3);
17173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
17183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
17193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
17203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
17213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
17223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
17233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
17243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
17253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
17263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
17273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
17283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
17293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
17303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
17313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
17323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
17333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
17343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
17353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
17363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
17373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res15 = _mm_packs_epi32(w0, w1);
17383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res07 = _mm_packs_epi32(w2, w3);
17393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
17403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
17413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Transpose the results, do it as two 8x8 transposes.
17423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      {
17433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 00 01 02 03 04 05 06 07
17443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 10 11 12 13 14 15 16 17
17453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 20 21 22 23 24 25 26 27
17463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 30 31 32 33 34 35 36 37
17473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 40 41 42 43 44 45 46 47
17483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 50 51 52 53 54 55 56 57
17493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 60 61 62 63 64 65 66 67
17503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 70 71 72 73 74 75 76 77
17513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
17523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
17533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
17543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
17553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
17563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
17573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
17583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
17593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 00 10 01 11 02 12 03 13
17603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 20 30 21 31 22 32 23 33
17613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 04 14 05 15 06 16 07 17
17623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 24 34 25 35 26 36 27 37
17633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 40 50 41 51 42 52 43 53
17643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 60 70 61 71 62 72 63 73
17653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 44 54 45 55 46 56 47 57
17663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 64 74 65 75 66 76 67 77
17673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
17683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
17693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
17703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
17713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
17723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
17733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
17743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
17753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 00 10 20 30 01 11 21 31
17763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 40 50 60 70 41 51 61 71
17773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 02 12 22 32 03 13 23 33
17783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 42 52 62 72 43 53 63 73
17793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 04 14 24 34 05 15 25 35
17803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 44 54 64 74 45 55 65 75
17813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 06 16 26 36 07 17 27 37
17823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 46 56 66 76 47 57 67 77
17833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
17843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const