/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org
113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org#include <emmintrin.h>  // SSE2
123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org#include "vp9/common/vp9_idct.h"  // for cospi constants
1347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org#include "vpx_ports/mem.h"
143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org
15d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
16d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// DC-only 4x4 forward transform: computes twice the sum of the 16 input
// samples.
//
//   input  - top-left sample of the 4x4 block
//   output - receives 8 int16_t values (16 bytes, no alignment required):
//            output[0] is the low 16 bits of 2 * sum, output[1..7] are zero
//   stride - distance, in int16_t elements, between consecutive input rows
void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  // Keeps the low 16 bits of 32-bit lane 0 and clears every other bit.
  const __m128i dc_mask = _mm_cvtsi32_si128(0xFFFF);
  __m128i in0, in1;
  __m128i tmp;

  // in0 = [row 0 | row 3], in1 = [row 1 | row 2]
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(
      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(
      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));

  // Add the rows pairwise in 16 bits (wraps if a pair leaves int16 range),
  // then sign-extend the eight partial sums to 32 bits by placing each one
  // in the upper half of a 32-bit lane and shifting it back down.
  tmp = _mm_add_epi16(in0, in1);
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  // tmp = the four column sums [c0 c1 c2 c3]
  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  // tmp = [c0+c2  0  c1+c3  0]
  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

  // in1 = [total  0  c1+c3  0], then doubled.
  in1 = _mm_add_epi32(tmp, in0);
  in0 = _mm_slli_epi32(in1, 1);

  // Lane 0 is a 32-bit value, so its upper half (all ones for a negative
  // sum) would land in output[1], and the leftover c1+c3 partial sum would
  // land in output[4..5].  Mask both away so that only the DC term is
  // non-zero, and store unaligned so the caller's buffer need not be
  // 16-byte aligned.
  in0 = _mm_and_si128(in0, dc_mask);
  _mm_storeu_si128((__m128i *)(output), in0);
}
4588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org
46ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgvoid vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
4793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // This 2D transform implements 4 vertical 1D transforms followed
4893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
4993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // by Chen, Smith and Fralick ('77).  The commands for moving the data
5093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // around have been minimized by hand.
5193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // For the purposes of the comments, the 16 inputs are referred to at i0
5293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // through iF (in raster order), intermediate variables are a0, b0, c0
5393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // through f, and correspond to the in-place computations mapped to input
5493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // locations.  The outputs, o0 through oF are labeled according to the
5593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // output locations.
5693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Constants
5893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // These are the coefficients used for the multiplies.
5993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
6093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // where cospi_N_64 = cos(N pi /64)
6193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
6293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64,
6393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64,
6493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64);
6593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
6693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64,
6793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64,
6893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64);
6993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
7093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_8_64, cospi_24_64,
7193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_24_64, -cospi_8_64,
7293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_24_64, -cospi_8_64);
7393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
7493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_24_64, -cospi_8_64,
7593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_8_64, cospi_24_64,
7693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_8_64, cospi_24_64);
7793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
7893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64,
7993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64,
8093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, cospi_16_64);
8193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
8293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64,
8393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64,
8493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_16_64, -cospi_16_64);
8593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
8693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_8_64, cospi_24_64,
8793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            -cospi_8_64, -cospi_24_64,
8893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            -cospi_8_64, -cospi_24_64);
8993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
9093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            cospi_24_64, -cospi_8_64,
9193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            -cospi_24_64, cospi_8_64,
9293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                            -cospi_24_64, cospi_8_64);
9393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
9593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // This second rounding constant saves doing some extra adds at the end
9693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
9793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org                                               +(DCT_CONST_ROUNDING << 1));
9893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  const int DCT_CONST_BITS2 =  DCT_CONST_BITS+2;
993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
1003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
101d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  __m128i in0, in1;
10293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
1033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Load inputs.
1043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  {
1053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
10693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
10793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
10893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org           (input +  2 * stride)));
109d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
11093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org           (input +  3 * stride)));
11193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in0 = [i0 i1 i2 i3 iC iD iE iF]
11293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
113d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
11493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
11593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // multiply by 16 to give some extra precision
1163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in0 = _mm_slli_epi16(in0, 4);
1173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in1 = _mm_slli_epi16(in1, 4);
1183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // if (i == 0 && input[0]) input[0] += 1;
11993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // add 1 to the upper left pixel if it is non-zero, which helps reduce
12093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // the round-trip error
1213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    {
122411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org      // The mask will only contain whether the first value is zero, all
1233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // other comparison will fail as something shifted by 4 (above << 4)
1243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // can never be equal to one. To increment in the non-zero case, we
1253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // add the mask and one for the first element:
1263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      //   - if zero, mask = -1, v = v - 1 + 1 = v
1273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
1283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
1293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in0 = _mm_add_epi16(in0, mask);
1303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
1313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    }
1323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  }
13393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // There are 4 total stages, alternating between an add/subtract stage
13493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // followed by an multiply-and-add stage.
13593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  {
13693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Stage 1: Add/subtract
13793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
13893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in0 = [i0 i1 i2 i3 iC iD iE iF]
13993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
14093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
14193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
14293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
14393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // r1 = [iC i8 iD i9 iE iA iF iB]
14493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
14593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
14693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
14793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // r3 = [iC i8 iD i9 iF iB iE iA]
14893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
14993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i t0 = _mm_add_epi16(r2, r3);
15093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i t1 = _mm_sub_epi16(r2, r3);
15193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
15293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // t1 = [aC a8 aD a9 aF aB aE aA]
15393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
15493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Stage 2: multiply by constants (which gets us into 32 bits).
15593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // The constants needed here are:
15693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
15793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
15893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
15993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
16093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
16193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
16293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
16393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
16493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Then add and right-shift to get back to 16-bit range
1653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
16693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
16893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
17093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
17293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
17393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w0 = [b0 b1 b7 b6]
17493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w1 = [b8 b9 bF bE]
17593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w2 = [b4 b5 b3 b2]
17693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w3 = [bC bD bB bA]
17793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i x0 = _mm_packs_epi32(w0, w1);
17893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i x1 = _mm_packs_epi32(w2, w3);
17993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
18093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // x1 = [b4 b5 b3 b2 bC bD bB bA]
18193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in0 = _mm_shuffle_epi32(x0, 0xD8);
18293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in1 = _mm_shuffle_epi32(x1, 0x8D);
18393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
18493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in1 = [b3 b2 bB bA b4 b5 bC bD]
185d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  }
186d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  {
18793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // vertical DCTs finished. Now we do the horizontal DCTs.
18893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Stage 3: Add/subtract
18993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
19093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i t0 = _mm_add_epi16(in0, in1);
19193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i t1 = _mm_sub_epi16(in0, in1);
19293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
19393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
19493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
19593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Stage 4: multiply by constants (which gets us into 32 bits).
19693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // The constants needed here are:
19793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
19893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
19993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
20093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
20193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
20293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
20393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
20493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
20593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // Then add and right-shift to get back to 16-bit range
20693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // but this combines the final right-shift as well to save operations
20793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // This unusual rounding operations is to maintain bit-accurate
20893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // compatibility with the c version of this function which has two
20993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // rounding steps in a row.
21093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
21193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
21293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
21393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
21493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
21593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
21693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
21793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
21893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w0 = [o0 o4 o8 oC]
21993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w1 = [o2 o6 oA oE]
22093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w2 = [o1 o5 o9 oD]
22193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // w3 = [o3 o7 oB oF]
22293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // remember the o's are numbered according to the correct output location
22393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i x0 = _mm_packs_epi32(w0, w1);
22493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i x1 = _mm_packs_epi32(w2, w3);
22593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // x0 = [o0 o4 o8 oC o2 o6 oA oE]
22693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // x1 = [o1 o5 o9 oD o3 o7 oB oF]
22793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
22893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
22993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
23093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // y1 = [o2 o3 o6 o7 oA oB oE oF]
23193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in0 = _mm_unpacklo_epi32(y0, y1);
23293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
23393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    in1 = _mm_unpackhi_epi32(y0, y1);
23493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org    // in1 = [o8 o9 oA oB oC oD oE oF]
23593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  }
23693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // Post-condition (v + 1) >> 2 is now incorporated into previous
23793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // add and right-shift commands.  Only 2 store instructions needed
23893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  // because we are using the fact that 1/3 are stored just after 0/2.
23993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  {
24093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org     _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
24193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org     _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
2423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  }
2433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org}
2443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org
24593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
246ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgstatic INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
247ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org                                   int stride) {
24847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
24947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
25047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i mask;
25147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
25247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
25347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
25447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
25547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
25647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
25747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_slli_epi16(in[0], 4);
25847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_slli_epi16(in[1], 4);
25947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = _mm_slli_epi16(in[2], 4);
26047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_slli_epi16(in[3], 4);
26147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
26247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
26347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_add_epi16(in[0], mask);
26447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
26547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
26647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
26747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
26847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i kOne = _mm_set1_epi16(1);
26947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
27047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
27147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i out01 = _mm_add_epi16(in01, kOne);
27247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i out23 = _mm_add_epi16(in23, kOne);
27347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  out01 = _mm_srai_epi16(out01, 2);
27447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  out23 = _mm_srai_epi16(out23, 2);
27547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
27647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
27747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
27847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
27947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void transpose_4x4(__m128i *res) {
28047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // Combine and transpose
28147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 00 01 02 03 20 21 22 23
28247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 10 11 12 13 30 31 32 33
28347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
28447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
28547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
28647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 00 10 01 11 02 12 03 13
28747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 20 30 21 31 22 32 23 33
28847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
28947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
29047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
29147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 00 10 20 30 01 11 21 31
29247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // 02 12 22 32 03 13 23 33
29347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // only use the first 4 16-bit integers
29447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
29547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
29647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
29747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
298dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fdct4_sse2(__m128i *in) {
29947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
30047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
301ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
302ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
30347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
30447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
30547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i u[4], v[4];
306ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  u[0]=_mm_unpacklo_epi16(in[0], in[1]);
307ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  u[1]=_mm_unpacklo_epi16(in[3], in[2]);
308ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org
309ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  v[0] = _mm_add_epi16(u[0], u[1]);
310ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  v[1] = _mm_sub_epi16(u[0], u[1]);
31147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
31247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
31347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
314ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
315ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
31647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
31747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
31847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
31947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
32047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
32147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
32247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
32347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
32447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
32547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
32647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_packs_epi32(u[0], u[1]);
32747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_packs_epi32(u[2], u[3]);
32847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  transpose_4x4(in);
32947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
33047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
331dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fadst4_sse2(__m128i *in) {
33247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
33347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
33447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
33547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
33647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
33747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i kZero = _mm_set1_epi16(0);
33847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
33947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i u[8], v[8];
34047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in7 = _mm_add_epi16(in[0], in[1]);
34147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
34247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
34347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
34447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_unpacklo_epi16(in7, kZero);
34547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_unpacklo_epi16(in[2], kZero);
346d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  u[4] = _mm_unpacklo_epi16(in[3], kZero);
34747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
34847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
34947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
35047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
35147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
35247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
35347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
354d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
35547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
35647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(v[0], v[1]);
357d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  u[1] = _mm_sub_epi32(v[2], v[6]);
35847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(v[3], v[4]);
35947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_sub_epi32(u[2], u[0]);
36047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_slli_epi32(v[5], 2);
36147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_sub_epi32(u[4], v[5]);
36247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_add_epi32(u[3], u[5]);
36347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
36447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
36547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
36647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
36747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
36847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
36947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
37047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
37147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
37247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
37347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
37447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_packs_epi32(u[0], u[2]);
37547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_packs_epi32(u[1], u[3]);
37647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  transpose_4x4(in);
37747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
37847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
37976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
38076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                     int stride, int tx_type) {
38147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in[4];
38276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
38347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  switch (tx_type) {
38476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case DCT_DCT:
38576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      vp9_fdct4x4_sse2(input, output, stride);
38647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
38776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case ADST_DCT:
38876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_4x4(input, in, stride);
389dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst4_sse2(in);
390dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fdct4_sse2(in);
39176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_4x4(output, in);
39247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
39376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case DCT_ADST:
39476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_4x4(input, in, stride);
395dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fdct4_sse2(in);
396dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst4_sse2(in);
39776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_4x4(output, in);
39847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
39976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case ADST_ADST:
40076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_4x4(input, in, stride);
401dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst4_sse2(in);
402dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst4_sse2(in);
40376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_4x4(output, in);
40447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
40576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org   default:
40676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     assert(0);
40776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     break;
40847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  }
40947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
41047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
// Sums all 64 samples of an 8x8 block.
//   input:  eight rows of eight int16_t, `stride` elements apart; rows are
//           read with aligned 128-bit loads, so each must be 16-byte aligned.
//   output: receives one aligned 128-bit store (eight int16_t are written).
//           The low 32 bits hold the total, i.e. output[0] is its low half
//           and output[1] its high half.
// NOTE(review): the remaining stored lanes are reduction by-products
// (output[4..5] holds a partial sum, output[2..3] and output[6..7] are zero);
// confirm that no caller relies on anything beyond output[0].
void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i col_sums = _mm_load_si128((const __m128i *)input);
  __m128i wide_lo, wide_hi, quad, pair, total;
  int row;

  // Per-column sums in eight 16-bit lanes (wrapping adds).
  for (row = 1; row < 8; ++row) {
    const __m128i line = _mm_load_si128((const __m128i *)(input + row * stride));
    col_sums = _mm_add_epi16(col_sums, line);
  }

  // Sign-extend to 32 bits: place each value in the upper half of a 32-bit
  // lane, then shift it back down arithmetically.
  wide_lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, col_sums), 16);
  wide_hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, col_sums), 16);

  // Horizontal reduction: 8 lanes -> 4 -> 2 -> 1.
  quad = _mm_add_epi32(wide_lo, wide_hi);
  pair = _mm_add_epi32(_mm_unpacklo_epi32(quad, zero),
                       _mm_unpackhi_epi32(quad, zero));
  total = _mm_add_epi32(pair, _mm_srli_si128(pair, 8));

  _mm_store_si128((__m128i *)output, total);
}
45088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org
451ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgvoid vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
4523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  int pass;
4533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Constants
4543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    When we use them, in one case, they are all the same. In all others
4553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    it's a pair of them that we need to repeat four times. This is done
4563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    by constructing the 32 bit constant corresponding to that pair.
4573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
4583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
4593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
4603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
4613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
4623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
4633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
4643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
4653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
4663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Load input
46747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
46847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
46947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
47047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
47147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
47247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
47347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
47447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
4753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Pre-condition input (shift by two)
4763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in0 = _mm_slli_epi16(in0, 2);
4773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in1 = _mm_slli_epi16(in1, 2);
4783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in2 = _mm_slli_epi16(in2, 2);
4793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in3 = _mm_slli_epi16(in3, 2);
4803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in4 = _mm_slli_epi16(in4, 2);
4813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in5 = _mm_slli_epi16(in5, 2);
4823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in6 = _mm_slli_epi16(in6, 2);
4833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  in7 = _mm_slli_epi16(in7, 2);
4843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org
4853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // We do two passes, first the columns, then the rows. The results of the
4863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // first pass are transposed so that the same column code can be reused. The
4873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // results of the second pass are also transposed so that the rows (processed
4883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // as columns) are put back in row positions.
4893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  for (pass = 0; pass < 2; pass++) {
4903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // To store results of each pass before the transpose.
4913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
492411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // Add/subtract
4933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q0 = _mm_add_epi16(in0, in7);
4943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q1 = _mm_add_epi16(in1, in6);
4953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q2 = _mm_add_epi16(in2, in5);
4963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q3 = _mm_add_epi16(in3, in4);
4973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q4 = _mm_sub_epi16(in3, in4);
4983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q5 = _mm_sub_epi16(in2, in5);
4993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q6 = _mm_sub_epi16(in1, in6);
5003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i q7 = _mm_sub_epi16(in0, in7);
5013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // Work on first four results
5023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    {
503411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org      // Add/subtract
5043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r0 = _mm_add_epi16(q0, q3);
5053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r1 = _mm_add_epi16(q1, q2);
5063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r2 = _mm_sub_epi16(q1, q2);
5073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r3 = _mm_sub_epi16(q0, q3);
5083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Interleave to do the multiply by constants which gets us into 32bits
5093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
5103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
5113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
5123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
5133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
5143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
5153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
5163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
5173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
5183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
5193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
5203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
5213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // dct_const_round_shift
5223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
5233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
5243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
5253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
5263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
5273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
5283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
5293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
5303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
5313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
5323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
5333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
5343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
5353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
5363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
5373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
5383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Combine
5393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res0 = _mm_packs_epi32(w0, w1);
5403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res4 = _mm_packs_epi32(w2, w3);
5413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res2 = _mm_packs_epi32(w4, w5);
5423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res6 = _mm_packs_epi32(w6, w7);
5433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    }
5443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // Work on next four results
5453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    {
5463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Interleave to do the multiply by constants which gets us into 32bits
5473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
5483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
5493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
5503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
5513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
5523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
5533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // dct_const_round_shift
5543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
5553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
5563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
5573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
5583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
5593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
5603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
5613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
5623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Combine
5633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r0 = _mm_packs_epi32(s0, s1);
5643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i r1 = _mm_packs_epi32(s2, s3);
565411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org      // Add/subtract
5663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i x0 = _mm_add_epi16(q4, r0);
5673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i x1 = _mm_sub_epi16(q4, r0);
5683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i x2 = _mm_sub_epi16(q7, r1);
5693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i x3 = _mm_add_epi16(q7, r1);
5703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Interleave to do the multiply by constants which gets us into 32bits
5713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
5723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
5733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
5743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
5753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
5763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
5773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
5783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
5793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
5803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
5813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
5823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
5833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // dct_const_round_shift
5843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
5853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
5863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
5873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
5883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
5893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
5903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
5913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
5923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
5933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
5943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
5953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
5963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
5973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
5983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
5993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
6003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Combine
6013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res1 = _mm_packs_epi32(w0, w1);
6023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res7 = _mm_packs_epi32(w2, w3);
6033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res5 = _mm_packs_epi32(w4, w5);
6043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      res3 = _mm_packs_epi32(w6, w7);
6053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    }
6063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // Transpose the 8x8.
6073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    {
6083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 00 01 02 03 04 05 06 07
6093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 10 11 12 13 14 15 16 17
6103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 20 21 22 23 24 25 26 27
6113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 30 31 32 33 34 35 36 37
6123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 40 41 42 43 44 45 46 47
6133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 50 51 52 53 54 55 56 57
6143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 60 61 62 63 64 65 66 67
6153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 70 71 72 73 74 75 76 77
6163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
6173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
6183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
6193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
6203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
6213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
6223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
6233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
6243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 00 10 01 11 02 12 03 13
6253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 20 30 21 31 22 32 23 33
6263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 04 14 05 15 06 16 07 17
6273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 24 34 25 35 26 36 27 37
6283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 40 50 41 51 42 52 43 53
6293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
6313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 64 74 65 75 66 76 67 77
6323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
6333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
6343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
6353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
6363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
6373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
6383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
6393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
6403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 00 10 20 30 01 11 21 31
6413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 40 50 60 70 41 51 61 71
6423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 02 12 22 32 03 13 23 33
6433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
6463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 06 16 26 36 07 17 27 37
6473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 46 56 66 76 47 57 67 77
6483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
6493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
6503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
6513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
6523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
6533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
6543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
6553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
6563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 00 10 20 30 40 50 60 70
6573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 01 11 21 31 41 51 61 71
6583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 02 12 22 32 42 52 62 72
6593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 03 13 23 33 43 53 63 73
6603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 04 14 24 34 44 54 64 74
6613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 05 15 25 35 45 55 65 75
6623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 06 16 26 36 46 56 66 76
6633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // 07 17 27 37 47 57 67 77
6643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    }
6653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  }
6663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Post-condition output and store it
6673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  {
6683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // Post-condition (division by two)
6693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    //    division of two 16 bits signed numbers using shifts
6703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    //    n / 2 = (n - (n >> 15)) >> 1
6713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
6723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
6733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
6743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
6753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
6763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
6773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
6783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
6793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in0 = _mm_sub_epi16(in0, sign_in0);
6803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in1 = _mm_sub_epi16(in1, sign_in1);
6813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in2 = _mm_sub_epi16(in2, sign_in2);
6823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in3 = _mm_sub_epi16(in3, sign_in3);
6833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in4 = _mm_sub_epi16(in4, sign_in4);
6843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in5 = _mm_sub_epi16(in5, sign_in5);
6853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in6 = _mm_sub_epi16(in6, sign_in6);
6863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in7 = _mm_sub_epi16(in7, sign_in7);
6873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in0 = _mm_srai_epi16(in0, 1);
6883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in1 = _mm_srai_epi16(in1, 1);
6893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in2 = _mm_srai_epi16(in2, 1);
6903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in3 = _mm_srai_epi16(in3, 1);
6913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in4 = _mm_srai_epi16(in4, 1);
6923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in5 = _mm_srai_epi16(in5, 1);
6933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in6 = _mm_srai_epi16(in6, 1);
6943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in7 = _mm_srai_epi16(in7, 1);
6953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // store results
69647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
69747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
69847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
69947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
70047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
70147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
70247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
70347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
70447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  }
70547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
70647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
70747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org// load 8x8 array
708ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgstatic INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
709ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org                                   int stride) {
710ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
711ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
712ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
713ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
714ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
715ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
716ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
717ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
71847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
71947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_slli_epi16(in[0], 2);
72047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_slli_epi16(in[1], 2);
72147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = _mm_slli_epi16(in[2], 2);
72247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_slli_epi16(in[3], 2);
72347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[4] = _mm_slli_epi16(in[4], 2);
72447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[5] = _mm_slli_epi16(in[5], 2);
72547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[6] = _mm_slli_epi16(in[6], 2);
72647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[7] = _mm_slli_epi16(in[7], 2);
72747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
72847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
72947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org// right shift and rounding
73047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void right_shift_8x8(__m128i *res, int const bit) {
73147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i kOne = _mm_set1_epi16(1);
73247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const int bit_m02 = bit - 2;
73347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign0 = _mm_srai_epi16(res[0], 15);
73447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign1 = _mm_srai_epi16(res[1], 15);
73547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign2 = _mm_srai_epi16(res[2], 15);
73647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign3 = _mm_srai_epi16(res[3], 15);
73747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign4 = _mm_srai_epi16(res[4], 15);
73847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign5 = _mm_srai_epi16(res[5], 15);
73947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign6 = _mm_srai_epi16(res[6], 15);
74047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i sign7 = _mm_srai_epi16(res[7], 15);
74147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
74247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  if (bit_m02 >= 0) {
74347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
74447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[0] = _mm_add_epi16(res[0], k_const_rounding);
74547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[1] = _mm_add_epi16(res[1], k_const_rounding);
74647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[2] = _mm_add_epi16(res[2], k_const_rounding);
74747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[3] = _mm_add_epi16(res[3], k_const_rounding);
74847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[4] = _mm_add_epi16(res[4], k_const_rounding);
74947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[5] = _mm_add_epi16(res[5], k_const_rounding);
75047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[6] = _mm_add_epi16(res[6], k_const_rounding);
75147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    res[7] = _mm_add_epi16(res[7], k_const_rounding);
75247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  }
75347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
75447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[0] = _mm_sub_epi16(res[0], sign0);
75547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[1] = _mm_sub_epi16(res[1], sign1);
75647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[2] = _mm_sub_epi16(res[2], sign2);
75747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[3] = _mm_sub_epi16(res[3], sign3);
75847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[4] = _mm_sub_epi16(res[4], sign4);
75947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[5] = _mm_sub_epi16(res[5], sign5);
76047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[6] = _mm_sub_epi16(res[6], sign6);
76147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[7] = _mm_sub_epi16(res[7], sign7);
76247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
76347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[0] = _mm_srai_epi16(res[0], bit);
76447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[1] = _mm_srai_epi16(res[1], bit);
76547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[2] = _mm_srai_epi16(res[2], bit);
76647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[3] = _mm_srai_epi16(res[3], bit);
76747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[4] = _mm_srai_epi16(res[4], bit);
76847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[5] = _mm_srai_epi16(res[5], bit);
76947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[6] = _mm_srai_epi16(res[6], bit);
77047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  res[7] = _mm_srai_epi16(res[7], bit);
77147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
77247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
77347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org// write 8x8 array
77447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
77547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
77647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
77747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
77847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
77947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
78047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
78147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
78247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
78347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
78447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
785dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fdct8_sse2(__m128i *in) {
78647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // constants
78747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
78847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
78947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
79047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
79147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
79247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
79347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
79447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
79547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
79647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
79747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
79847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
79947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
80047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 1
80147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s0 = _mm_add_epi16(in[0], in[7]);
80247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s1 = _mm_add_epi16(in[1], in[6]);
80347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s2 = _mm_add_epi16(in[2], in[5]);
80447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s3 = _mm_add_epi16(in[3], in[4]);
80547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s4 = _mm_sub_epi16(in[3], in[4]);
80647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s5 = _mm_sub_epi16(in[2], in[5]);
80747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s6 = _mm_sub_epi16(in[1], in[6]);
80847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s7 = _mm_sub_epi16(in[0], in[7]);
80947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
81047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_add_epi16(s0, s3);
81147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_add_epi16(s1, s2);
81247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_sub_epi16(s1, s2);
81347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_sub_epi16(s0, s3);
81447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // interleave and perform butterfly multiplication/addition
81547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_unpacklo_epi16(u0, u1);
81647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_unpackhi_epi16(u0, u1);
81747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_unpacklo_epi16(u2, u3);
81847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_unpackhi_epi16(u2, u3);
81947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
82047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
82147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
82247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
82347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
82447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
82547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
82647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
82747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
82847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
82947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // shift and rounding
83047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
83147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
83247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
83347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
83447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
83547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
83647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
83747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
83847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
83947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
84047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
84147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
84247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
84347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
84447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
84547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
84647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
84747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
84847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_packs_epi32(u0, u1);
84947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = _mm_packs_epi32(u4, u5);
85047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[4] = _mm_packs_epi32(u2, u3);
85147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[6] = _mm_packs_epi32(u6, u7);
85247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
85347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 2
85447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // interleave and perform butterfly multiplication/addition
85547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_unpacklo_epi16(s6, s5);
85647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_unpackhi_epi16(s6, s5);
85747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
85847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
85947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
86047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
86147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
86247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // shift and rounding
86347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
86447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
86547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
86647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
86747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
86847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
86947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
87047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
87147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
87247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
87347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_packs_epi32(v0, v1);
87447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_packs_epi32(v2, v3);
87547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
87647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 3
87747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s0 = _mm_add_epi16(s4, u0);
87847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s1 = _mm_sub_epi16(s4, u0);
87947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s2 = _mm_sub_epi16(s7, u1);
88047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s3 = _mm_add_epi16(s7, u1);
88147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
88247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 4
88347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_unpacklo_epi16(s0, s3);
88447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_unpackhi_epi16(s0, s3);
88547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_unpacklo_epi16(s1, s2);
88647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_unpackhi_epi16(s1, s2);
88747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
88847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
88947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
89047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
89147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
89247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
89347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
89447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
89547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
89647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
89747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // shift and rounding
89847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
89947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
90047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
90147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
90247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
90347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
90447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
90547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
90647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
90747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
90847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
90947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
91047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
91147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
91247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
91347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
91447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
91547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
91647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_packs_epi32(v0, v1);
91747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_packs_epi32(v4, v5);
91847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[5] = _mm_packs_epi32(v2, v3);
91947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[7] = _mm_packs_epi32(v6, v7);
92047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
92147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // transpose
92247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  array_transpose_8x8(in, in);
92347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
92447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
925dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fadst8_sse2(__m128i *in) {
92647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // Constants
92747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
92847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
92947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
93047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
93147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
93247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
93347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
93447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
93547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
93647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
93747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
93847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
93947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
94047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__const_0 = _mm_set1_epi16(0);
94147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
94247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
94347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
94447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
94547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
94647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
94747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
94847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
94947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // properly aligned for butterfly input
95047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in0  = in[7];
95147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in1  = in[0];
95247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in2  = in[5];
95347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in3  = in[2];
95447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in4  = in[3];
95547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in5  = in[4];
95647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in6  = in[1];
95747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in7  = in[6];
95847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
95947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // column transformation
96047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 1
96147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // interleave and multiply/add into 32-bit integer
96247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s0 = _mm_unpacklo_epi16(in0, in1);
96347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s1 = _mm_unpackhi_epi16(in0, in1);
96447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s2 = _mm_unpacklo_epi16(in2, in3);
96547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s3 = _mm_unpackhi_epi16(in2, in3);
96647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s4 = _mm_unpacklo_epi16(in4, in5);
96747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s5 = _mm_unpackhi_epi16(in4, in5);
96847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s6 = _mm_unpacklo_epi16(in6, in7);
96947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s7 = _mm_unpackhi_epi16(in6, in7);
97047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
97147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
97247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
97347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
97447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
97547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
97647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
97747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
97847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
97947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
98047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
98147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
98247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
98347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
98447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
98547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
98647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
98747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
98847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // addition
98947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w0 = _mm_add_epi32(u0, u8);
99047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w1 = _mm_add_epi32(u1, u9);
99147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w2 = _mm_add_epi32(u2, u10);
99247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w3 = _mm_add_epi32(u3, u11);
99347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w4 = _mm_add_epi32(u4, u12);
99447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w5 = _mm_add_epi32(u5, u13);
99547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w6 = _mm_add_epi32(u6, u14);
99647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w7 = _mm_add_epi32(u7, u15);
99747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w8 = _mm_sub_epi32(u0, u8);
99847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w9 = _mm_sub_epi32(u1, u9);
99947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w10 = _mm_sub_epi32(u2, u10);
100047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w11 = _mm_sub_epi32(u3, u11);
100147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w12 = _mm_sub_epi32(u4, u12);
100247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w13 = _mm_sub_epi32(u5, u13);
100347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w14 = _mm_sub_epi32(u6, u14);
100447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w15 = _mm_sub_epi32(u7, u15);
100547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
100647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // shift and rounding
100747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
100847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
100947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
101047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
101147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
101247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
101347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
101447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
101547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
101647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
101747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
101847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
101947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
102047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
102147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
102247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
102347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
102447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
102547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
102647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
102747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
102847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
102947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
103047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
103147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
103247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
103347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
103447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
103547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
103647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
103747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
103847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
103947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
104047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
104147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // back to 16-bit and pack 8 integers into __m128i
104247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_packs_epi32(u0, u1);
104347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_packs_epi32(u2, u3);
104447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = _mm_packs_epi32(u4, u5);
104547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_packs_epi32(u6, u7);
104647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[4] = _mm_packs_epi32(u8, u9);
104747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[5] = _mm_packs_epi32(u10, u11);
104847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[6] = _mm_packs_epi32(u12, u13);
104947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[7] = _mm_packs_epi32(u14, u15);
105047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
105147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 2
105247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s0 = _mm_add_epi16(in[0], in[2]);
105347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s1 = _mm_add_epi16(in[1], in[3]);
105447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s2 = _mm_sub_epi16(in[0], in[2]);
105547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s3 = _mm_sub_epi16(in[1], in[3]);
105647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_unpacklo_epi16(in[4], in[5]);
105747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_unpackhi_epi16(in[4], in[5]);
105847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_unpacklo_epi16(in[6], in[7]);
105947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_unpackhi_epi16(in[6], in[7]);
106047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
106147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
106247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
106347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
106447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
106547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
106647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
106747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
106847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
106947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
107047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w0 = _mm_add_epi32(v0, v4);
107147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w1 = _mm_add_epi32(v1, v5);
107247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w2 = _mm_add_epi32(v2, v6);
107347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w3 = _mm_add_epi32(v3, v7);
107447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w4 = _mm_sub_epi32(v0, v4);
107547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w5 = _mm_sub_epi32(v1, v5);
107647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w6 = _mm_sub_epi32(v2, v6);
107747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  w7 = _mm_sub_epi32(v3, v7);
107847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
107947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
108047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
108147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
108247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
108347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
108447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
108547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
108647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
108747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
108847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
108947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
109047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
109147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
109247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
109347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
109447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
109547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
109647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
109747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // back to 16-bit intergers
109847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s4 = _mm_packs_epi32(u0, u1);
109947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s5 = _mm_packs_epi32(u2, u3);
110047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s6 = _mm_packs_epi32(u4, u5);
110147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s7 = _mm_packs_epi32(u6, u7);
110247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
110347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 3
110447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_unpacklo_epi16(s2, s3);
110547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_unpackhi_epi16(s2, s3);
110647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_unpacklo_epi16(s6, s7);
110747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_unpackhi_epi16(s6, s7);
110847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
110947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
111047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
111147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
111247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
111347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
111447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
111547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
111647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
111747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
111847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
111947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
112047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
112147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
112247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
112347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
112447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
112547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
112647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
112747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
112847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
112947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
113047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
113147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
113247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
113347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
113447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
113547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
113647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s2 = _mm_packs_epi32(v0, v1);
113747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s3 = _mm_packs_epi32(v2, v3);
113847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s6 = _mm_packs_epi32(v4, v5);
113947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s7 = _mm_packs_epi32(v6, v7);
114047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
114147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // FIXME(jingning): do subtract using bit inversion?
114247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = s0;
114347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_sub_epi16(k__const_0, s4);
114447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = s6;
114547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_sub_epi16(k__const_0, s2);
114647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[4] = s3;
114747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[5] = _mm_sub_epi16(k__const_0, s7);
114847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[6] = s5;
114947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[7] = _mm_sub_epi16(k__const_0, s1);
115047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
115147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // transpose
115247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  array_transpose_8x8(in, in);
115347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
115447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
115576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
115676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                     int stride, int tx_type) {
115747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in[8];
115876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
115947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  switch (tx_type) {
116076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case DCT_DCT:
116176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      vp9_fdct8x8_sse2(input, output, stride);
116247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
116376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case ADST_DCT:
116476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_8x8(input, in, stride);
1165dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst8_sse2(in);
1166dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fdct8_sse2(in);
116776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      right_shift_8x8(in, 1);
116876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_8x8(output, in, 8);
116947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
117076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case DCT_ADST:
117176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_8x8(input, in, stride);
1172dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fdct8_sse2(in);
1173dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst8_sse2(in);
117476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      right_shift_8x8(in, 1);
117576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_8x8(output, in, 8);
117647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
117776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case ADST_ADST:
117876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_8x8(input, in, stride);
1179dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst8_sse2(in);
1180dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst8_sse2(in);
118176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      right_shift_8x8(in, 1);
118276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_8x8(output, in, 8);
118347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
118447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    default:
118547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      assert(0);
118647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
11873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  }
11883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org}
11893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org
// DC-only forward transform for a 16x16 block.
//
// Sums all 256 input samples, arithmetically shifts the total right by one
// and stores it in output[0]; output[1] is set to zero. No other element of
// output is written.
//
// input  - top-left sample of the block. Rows are fetched with aligned
//          16-byte loads, so input must be 16-byte aligned and stride must
//          keep every row aligned (a multiple of 8 samples).
// output - receives the result in output[0..1]; no alignment is required.
// stride - distance between consecutive input rows, in int16_t elements.
//
// NOTE(review): the per-lane sums are accumulated in 16 bits (32 samples per
// lane) and wrap on overflow; presumably callers pass residuals small enough
// for this not to matter -- confirm against the callers.
void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum = _mm_setzero_si128();
  __m128i lo, hi;
  int half, row;

  // Accumulate the 16 rows of the left 8 columns, then of the right 8
  // columns, into eight 16-bit lanes.
  for (half = 0; half < 2; ++half) {
    const int16_t *const src = input + 8 * half;
    for (row = 0; row < 16; ++row) {
      const __m128i v = _mm_load_si128((const __m128i *)(src + row * stride));
      sum = _mm_add_epi16(sum, v);
    }
  }

  // Sign-extend the eight 16-bit lane sums to 32 bits: place each value in
  // the upper half of a 32-bit lane, then shift it back down arithmetically.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, sum), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, sum), 16);
  sum = _mm_add_epi32(lo, hi);

  // Fold the four 32-bit lanes so that lane 0 holds the grand total.
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  sum = _mm_srai_epi32(sum, 1);

  // Store only the DC value. A full 16-byte vector store here would put the
  // sign-extension bits of the 32-bit result in output[1], leak partial sums
  // into later elements, and require output to be 16-byte aligned.
  output[0] = (int16_t)_mm_cvtsi128_si32(sum);
  output[1] = 0;
}
125788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org
1258ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgvoid vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
12593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // The 2D transform is done with two passes which are actually pretty
12603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // similar. In the first one, we transform the columns and transpose
12613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // the results. In the second one, we transform the rows. To achieve that,
1262411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // as the first pass results are transposed, we transpose the columns (that
12633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // is the transposed rows) and transpose the results (so that it goes back
12643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // in normal/row positions).
12653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  int pass;
12663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // We need an intermediate buffer between passes.
126747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
1268ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org  const int16_t *in = input;
12693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  int16_t *out = intermediate;
12703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Constants
12713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    When we use them, in one case, they are all the same. In all others
12723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    it's a pair of them that we need to repeat four times. This is done
12733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  //    by constructing the 32 bit constant corresponding to that pair.
12743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
12753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
12763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
127788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
12783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
12793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
12803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
12813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
12823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
12833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
12843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
12853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
12863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
12873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
12883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
12893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
12903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
12913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
12923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  const __m128i kOne = _mm_set1_epi16(1);
12933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  // Do the two transform/transpose passes
12943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  for (pass = 0; pass < 2; ++pass) {
12953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // We process eight columns (transposed rows in second pass) at a time.
12963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    int column_start;
12973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    for (column_start = 0; column_start < 16; column_start += 8) {
12983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
12993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
13003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
13013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i step1_0, step1_1, step1_2, step1_3;
13023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i step1_4, step1_5, step1_6, step1_7;
13033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
13043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i step3_0, step3_1, step3_2, step3_3;
13053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i step3_4, step3_5, step3_6, step3_7;
13063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
13073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
13083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Load and pre-condition input.
13093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      if (0 == pass) {
131047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
131147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
131247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
131347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
131447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
131547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
131647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
131747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
131847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
131947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
132047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
132147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
132247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
132347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
132447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
132547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
13263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // x = x << 2
13273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in00 = _mm_slli_epi16(in00, 2);
13283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in01 = _mm_slli_epi16(in01, 2);
13293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in02 = _mm_slli_epi16(in02, 2);
13303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in03 = _mm_slli_epi16(in03, 2);
13313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in04 = _mm_slli_epi16(in04, 2);
13323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in05 = _mm_slli_epi16(in05, 2);
13333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in06 = _mm_slli_epi16(in06, 2);
13343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in07 = _mm_slli_epi16(in07, 2);
13353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in08 = _mm_slli_epi16(in08, 2);
13363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in09 = _mm_slli_epi16(in09, 2);
13373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in10 = _mm_slli_epi16(in10, 2);
13383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in11 = _mm_slli_epi16(in11, 2);
13393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in12 = _mm_slli_epi16(in12, 2);
13403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in13 = _mm_slli_epi16(in13, 2);
13413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in14 = _mm_slli_epi16(in14, 2);
13423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in15 = _mm_slli_epi16(in15, 2);
13433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      } else {
134447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
134547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
134647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
134747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
134847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
134947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
135047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
135147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
135247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
135347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
135447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
135547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
135647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
135747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
135847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
135947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
13603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // x = (x + 1) >> 2
13613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in00 = _mm_add_epi16(in00, kOne);
13623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in01 = _mm_add_epi16(in01, kOne);
13633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in02 = _mm_add_epi16(in02, kOne);
13643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in03 = _mm_add_epi16(in03, kOne);
13653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in04 = _mm_add_epi16(in04, kOne);
13663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in05 = _mm_add_epi16(in05, kOne);
13673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in06 = _mm_add_epi16(in06, kOne);
13683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in07 = _mm_add_epi16(in07, kOne);
13693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in08 = _mm_add_epi16(in08, kOne);
13703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in09 = _mm_add_epi16(in09, kOne);
13713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in10 = _mm_add_epi16(in10, kOne);
13723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in11 = _mm_add_epi16(in11, kOne);
13733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in12 = _mm_add_epi16(in12, kOne);
13743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in13 = _mm_add_epi16(in13, kOne);
13753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in14 = _mm_add_epi16(in14, kOne);
13763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in15 = _mm_add_epi16(in15, kOne);
13773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in00 = _mm_srai_epi16(in00, 2);
13783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in01 = _mm_srai_epi16(in01, 2);
13793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in02 = _mm_srai_epi16(in02, 2);
13803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in03 = _mm_srai_epi16(in03, 2);
13813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in04 = _mm_srai_epi16(in04, 2);
13823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in05 = _mm_srai_epi16(in05, 2);
13833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in06 = _mm_srai_epi16(in06, 2);
13843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in07 = _mm_srai_epi16(in07, 2);
13853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in08 = _mm_srai_epi16(in08, 2);
13863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in09 = _mm_srai_epi16(in09, 2);
13873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in10 = _mm_srai_epi16(in10, 2);
13883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in11 = _mm_srai_epi16(in11, 2);
13893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in12 = _mm_srai_epi16(in12, 2);
13903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in13 = _mm_srai_epi16(in13, 2);
13913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in14 = _mm_srai_epi16(in14, 2);
13923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        in15 = _mm_srai_epi16(in15, 2);
13933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
13943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      in += 8;
13953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Calculate input for the first 8 results.
13963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      {
13973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input0 = _mm_add_epi16(in00, in15);
13983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input1 = _mm_add_epi16(in01, in14);
13993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input2 = _mm_add_epi16(in02, in13);
14003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input3 = _mm_add_epi16(in03, in12);
14013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input4 = _mm_add_epi16(in04, in11);
14023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input5 = _mm_add_epi16(in05, in10);
14033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input6 = _mm_add_epi16(in06, in09);
14043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        input7 = _mm_add_epi16(in07, in08);
14053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
14063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Calculate input for the next 8 results.
14073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      {
14083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_0 = _mm_sub_epi16(in07, in08);
14093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_1 = _mm_sub_epi16(in06, in09);
14103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_2 = _mm_sub_epi16(in05, in10);
14113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_3 = _mm_sub_epi16(in04, in11);
14123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_4 = _mm_sub_epi16(in03, in12);
14133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_5 = _mm_sub_epi16(in02, in13);
14143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_6 = _mm_sub_epi16(in01, in14);
14153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        step1_7 = _mm_sub_epi16(in00, in15);
14163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
1417dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      // Work on the first eight values; fdct8(input, even_results);
14183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      {
1419411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org        // Add/subtract
14203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q0 = _mm_add_epi16(input0, input7);
14213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q1 = _mm_add_epi16(input1, input6);
14223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q2 = _mm_add_epi16(input2, input5);
14233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q3 = _mm_add_epi16(input3, input4);
14243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q4 = _mm_sub_epi16(input3, input4);
14253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q5 = _mm_sub_epi16(input2, input5);
14263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q6 = _mm_sub_epi16(input1, input6);
14273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i q7 = _mm_sub_epi16(input0, input7);
14283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // Work on first four results
14293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
1430411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org          // Add/subtract
14313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r0 = _mm_add_epi16(q0, q3);
14323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r1 = _mm_add_epi16(q1, q2);
14333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r2 = _mm_sub_epi16(q1, q2);
14343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r3 = _mm_sub_epi16(q0, q3);
14353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Interleave to do the multiply by constants which gets us
14363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // into 32 bits.
14373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
14383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
14393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
14403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
14413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
14423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
14433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
14443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
14453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
14463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
14473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
14483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
14493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
14503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
14513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
14523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
14533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
14543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
14553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
14563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
14573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
14583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
14593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
14603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
14613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
14623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
14633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
14643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
14653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
14663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
14673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res00 = _mm_packs_epi32(w0, w1);
14683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res08 = _mm_packs_epi32(w2, w3);
14693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res04 = _mm_packs_epi32(w4, w5);
14703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res12 = _mm_packs_epi32(w6, w7);
14713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
14723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // Work on next four results
14733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
14743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Interleave to do the multiply by constants which gets us
14753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // into 32 bits.
14763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
14773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
14783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
14793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
14803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
14813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
14823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
14833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
14843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
14853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
14863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
14873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
14883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
14893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
14903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
14913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
14923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r0 = _mm_packs_epi32(s0, s1);
14933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i r1 = _mm_packs_epi32(s2, s3);
1494411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org          // Add/subtract
14953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i x0 = _mm_add_epi16(q4, r0);
14963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i x1 = _mm_sub_epi16(q4, r0);
14973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i x2 = _mm_sub_epi16(q7, r1);
14983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i x3 = _mm_add_epi16(q7, r1);
14993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Interleave to do the multiply by constants which gets us
15003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // into 32 bits.
15013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
15023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
15033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
15043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
15053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
15063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
15073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
15083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
15093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
15103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
15113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
15123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
15133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
15143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
15153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
15163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
15173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
15183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
15193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
15203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
15213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
15223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
15233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
15243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
15253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
15263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
15273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
15283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
15293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
15303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
15313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res02 = _mm_packs_epi32(w0, w1);
15323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res14 = _mm_packs_epi32(w2, w3);
15333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res10 = _mm_packs_epi32(w4, w5);
15343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res06 = _mm_packs_epi32(w6, w7);
15353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
15363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
15373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Work on the next eight values; step1 -> odd_results
15383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      {
15393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // step 2
15403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
15413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
15423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
15433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
15443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
15453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
15463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
15473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
15483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
15493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
15503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
15513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
15523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
15533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
15543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
15553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
15563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
15573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
15583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
15593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_2 = _mm_packs_epi32(w0, w1);
15603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_3 = _mm_packs_epi32(w2, w3);
15613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
15623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
15633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
15643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
15653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
15663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
15673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
15683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
15693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
15703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
15713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
15723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
15733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
15743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
15753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
15763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
15773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
15783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
15793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
15803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
15813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_5 = _mm_packs_epi32(w0, w1);
15823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_4 = _mm_packs_epi32(w2, w3);
15833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
15843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // step 3
15853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
15863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_0 = _mm_add_epi16(step1_0, step2_3);
15873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_1 = _mm_add_epi16(step1_1, step2_2);
15883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_2 = _mm_sub_epi16(step1_1, step2_2);
15893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_3 = _mm_sub_epi16(step1_0, step2_3);
15903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_4 = _mm_sub_epi16(step1_7, step2_4);
15913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_5 = _mm_sub_epi16(step1_6, step2_5);
15923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_6 = _mm_add_epi16(step1_6, step2_5);
15933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step3_7 = _mm_add_epi16(step1_7, step2_4);
15943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
15953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // step 4
15963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
15973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
15983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
15993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
16003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
16013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
16023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
160388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
160488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
16053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
16063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
16073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
16083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
16093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
16103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
16113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
16123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
16133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
16143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
16153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_1 = _mm_packs_epi32(w0, w1);
16163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_2 = _mm_packs_epi32(w2, w3);
16173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
16183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
16193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
16203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
16213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
16223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
16233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
16243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
162588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
162688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
16273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
16283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
16293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
16303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
16313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
16323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
16333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
16343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
16353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
16363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
16373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_6 = _mm_packs_epi32(w0, w1);
16383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step2_5 = _mm_packs_epi32(w2, w3);
16393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
16403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // step 5
16413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
16423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step1_0 = _mm_add_epi16(step3_0, step2_1);
16433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step1_1 = _mm_sub_epi16(step3_0, step2_1);
164488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          step1_2 = _mm_add_epi16(step3_3, step2_2);
164588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          step1_3 = _mm_sub_epi16(step3_3, step2_2);
164688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          step1_4 = _mm_sub_epi16(step3_4, step2_5);
164788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org          step1_5 = _mm_add_epi16(step3_4, step2_5);
16483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step1_6 = _mm_sub_epi16(step3_7, step2_6);
16493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          step1_7 = _mm_add_epi16(step3_7, step2_6);
16503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
16513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // step 6
16523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
16533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
16543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
16553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
16563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
16573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
16583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
16593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
16603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
16613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
16623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
16633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
16643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
16653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
16663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
16673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
16683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
16693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
16703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
16713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res01 = _mm_packs_epi32(w0, w1);
16723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res09 = _mm_packs_epi32(w2, w3);
16733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
16743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
16753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
16763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
16773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
16783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
16793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
16803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
16813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
16823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
16833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
16843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
16853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
16863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
16873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
16883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
16893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
16903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
16913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
16923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
16933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res05 = _mm_packs_epi32(w0, w1);
16943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res13 = _mm_packs_epi32(w2, w3);
16953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
16963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
16973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
16983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
16993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
17003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
17013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
17023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
17033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
17043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
17053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
17063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
17073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
17083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
17093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
17103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
17113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
17123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
17133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
17143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
17153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res11 = _mm_packs_epi32(w0, w1);
17163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res03 = _mm_packs_epi32(w2, w3);
17173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
17183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        {
17193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
17203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
17213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
17223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
17233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
17243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
17253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
17263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
17273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // dct_const_round_shift
17283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
17293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
17303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
17313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
17323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
17333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
17343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
17353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
17363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          // Combine
17373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res15 = _mm_packs_epi32(w0, w1);
17383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org          res07 = _mm_packs_epi32(w2, w3);
17393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        }
17403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
17413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      // Transpose the results, do it as two 8x8 transposes.
17423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      {
17433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 00 01 02 03 04 05 06 07
17443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 10 11 12 13 14 15 16 17
17453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 20 21 22 23 24 25 26 27
17463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 30 31 32 33 34 35 36 37
17473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 40 41 42 43 44 45 46 47
17483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 50 51 52 53 54 55 56 57
17493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 60 61 62 63 64 65 66 67
17503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 70 71 72 73 74 75 76 77
17513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
17523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
17533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
17543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
17553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
17563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
17573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
17583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
17593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 00 10 01 11 02 12 03 13
17603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 20 30 21 31 22 32 23 33
17613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 04 14 05 15 06 16 07 17
17623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 24 34 25 35 26 36 27 37
17633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 40 50 41 51 42 52 43 53
17643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 60 70 61 71 62 72 63 73
        // 44 54 45 55 46 56 47 57
17663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 64 74 65 75 66 76 67 77
17673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
17683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
17693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
17703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
17713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
17723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
17733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
17743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
17753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 00 10 20 30 01 11 21 31
17763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 40 50 60 70 41 51 61 71
17773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 02 12 22 32 03 13 23 33
17783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 42 52 62 72 43 53 63 73
        // 04 14 24 34 05 15 25 35
        // 44 54 64 74 45 55 65 75
17813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 06 16 26 36 07 17 27 37
17823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 46 56 66 76 47 57 67 77
17833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
17843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
17853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
17863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
17873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
17883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
17893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
17903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
17913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 00 10 20 30 40 50 60 70
17923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 01 11 21 31 41 51 61 71
17933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 02 12 22 32 42 52 62 72
17943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 03 13 23 33 43 53 63 73
17953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 04 14 24 34 44 54 64 74
17963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 05 15 25 35 45 55 65 75
17973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 06 16 26 36 46 56 66 76
17983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 07 17 27 37 47 57 67 77
179910a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org        _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
180010a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org        _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
180110a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org        _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
180210a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org        _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
180310a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org        _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
180410a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org        _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
180510a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org        _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
180610a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org        _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
18073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
18083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      {
18093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 00 01 02 03 04 05 06 07
18103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 10 11 12 13 14 15 16 17
18113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 20 21 22 23 24 25 26 27
18123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 30 31 32 33 34 35 36 37
18133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 40 41 42 43 44 45 46 47
18143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 50 51 52 53 54 55 56 57
18153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 60 61 62 63 64 65 66 67
18163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 70 71 72 73 74 75 76 77
18173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
18183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
18193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
18203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
18213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
18223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
18233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
18243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
18253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 00 10 01 11 02 12 03 13
18263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 20 30 21 31 22 32 23 33
18273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 04 14 05 15 06 16 07 17
18283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 24 34 25 35 26 36 27 37
18293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 40 50 41 51 42 52 43 53
18303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 60 70 61 71 62 72 63 73
        // 44 54 45 55 46 56 47 57
18323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 64 74 65 75 66 76 67 77
18333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
18343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
18353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
18363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
18373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
18383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
18393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
18403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
18413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 00 10 20 30 01 11 21 31
18423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 40 50 60 70 41 51 61 71
18433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 02 12 22 32 03 13 23 33
18443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 42 52 62 72 43 53 63 73
        // 04 14 24 34 05 15 25 35
        // 44 54 64 74 45 55 65 75
18473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 06 16 26 36 07 17 27 37
18483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 46 56 66 76 47 57 67 77
18493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
18503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
18513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
18523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
18533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
18543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
18553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
18563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
18573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 00 10 20 30 40 50 60 70
18583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 01 11 21 31 41 51 61 71
18593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 02 12 22 32 42 52 62 72
18603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 03 13 23 33 43 53 63 73
18613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 04 14 24 34 44 54 64 74
18623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 05 15 25 35 45 55 65 75
18633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 06 16 26 36 46 56 66 76
18643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // 07 17 27 37 47 57 67 77
18653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org        // Store results
186647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
186747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
186847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
186947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
187047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
187147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
187247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
187347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org        _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
18743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      }
18753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org      out += 8*16;
18763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    }
18773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    // Setup in/out for next pass.
18783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    in = intermediate;
18793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org    out = output;
18803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org  }
18813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org}
188247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
1883ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgstatic INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
188447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org                                     __m128i *in1, int stride) {
188547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // load first 8 columns
188647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  load_buffer_8x8(input, in0, stride);
188747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
188847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
188947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  input += 8;
189047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // load second 8 columns
189147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  load_buffer_8x8(input, in1, stride);
189247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
189347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
189447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
189547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
189647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org                                      __m128i *in1, int stride) {
189747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // write first 8 columns
189847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  write_buffer_8x8(output, in0, stride);
189947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
190047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // write second 8 columns
190147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  output += 8;
190247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  write_buffer_8x8(output, in1, stride);
190347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
190447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
190547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
190647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
190747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // perform rounding operations
190847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  right_shift_8x8(res0, 2);
190947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  right_shift_8x8(res0 + 8, 2);
191047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  right_shift_8x8(res1, 2);
191147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  right_shift_8x8(res1 + 8, 2);
191247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
191347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
1914dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fdct16_8col(__m128i *in) {
191547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // perform 16x16 1-D DCT for 8 columns
191647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
191747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
191847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
191947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
192047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
192188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
192247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
192347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
192447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
192547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
192647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
192747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
192847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
192947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
193047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
193147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
193247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
193347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
193447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
193547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
193647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
193747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 1
193847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  i[0] = _mm_add_epi16(in[0], in[15]);
193947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  i[1] = _mm_add_epi16(in[1], in[14]);
194047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  i[2] = _mm_add_epi16(in[2], in[13]);
194147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  i[3] = _mm_add_epi16(in[3], in[12]);
194247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  i[4] = _mm_add_epi16(in[4], in[11]);
194347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  i[5] = _mm_add_epi16(in[5], in[10]);
194447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  i[6] = _mm_add_epi16(in[6], in[9]);
194547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  i[7] = _mm_add_epi16(in[7], in[8]);
194647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
194747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[0] = _mm_sub_epi16(in[7], in[8]);
194847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[1] = _mm_sub_epi16(in[6], in[9]);
194947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[2] = _mm_sub_epi16(in[5], in[10]);
195047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[3] = _mm_sub_epi16(in[4], in[11]);
195147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[4] = _mm_sub_epi16(in[3], in[12]);
195247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[5] = _mm_sub_epi16(in[2], in[13]);
195347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[6] = _mm_sub_epi16(in[1], in[14]);
195447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[7] = _mm_sub_epi16(in[0], in[15]);
195547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
195647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[0] = _mm_add_epi16(i[0], i[7]);
195747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[1] = _mm_add_epi16(i[1], i[6]);
195847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[2] = _mm_add_epi16(i[2], i[5]);
195947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[3] = _mm_add_epi16(i[3], i[4]);
196047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[4] = _mm_sub_epi16(i[3], i[4]);
196147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[5] = _mm_sub_epi16(i[2], i[5]);
196247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[6] = _mm_sub_epi16(i[1], i[6]);
196347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[7] = _mm_sub_epi16(i[0], i[7]);
196447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
196547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi16(p[0], p[3]);
196647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_add_epi16(p[1], p[2]);
196747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_sub_epi16(p[1], p[2]);
196847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_sub_epi16(p[0], p[3]);
196947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
197047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
197147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
197247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
197347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_unpackhi_epi16(u[2], u[3]);
197447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
197547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
197647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
197747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
197847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
197947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
198047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
198147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
198247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
198347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
198447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
198547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
198647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
198747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
198847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
198947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
199047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
199147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
199247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
199347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
199447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
199547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
199647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
199747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
199847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
199947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
200047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
200147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
200247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = _mm_packs_epi32(u[0], u[1]);
200347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[4] = _mm_packs_epi32(u[4], u[5]);
200447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[8] = _mm_packs_epi32(u[2], u[3]);
200547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[12] = _mm_packs_epi32(u[6], u[7]);
200647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
200747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
200847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
200947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
201047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
201147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
201247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
201347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
201447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
201547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
201647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
201747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
201847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
201947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
202047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
202147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
202247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
202347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
202447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_packs_epi32(v[0], v[1]);
202547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_packs_epi32(v[2], v[3]);
202647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
202747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[0] = _mm_add_epi16(p[4], u[0]);
202847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[1] = _mm_sub_epi16(p[4], u[0]);
202947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[2] = _mm_sub_epi16(p[7], u[1]);
203047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[3] = _mm_add_epi16(p[7], u[1]);
203147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
203247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
203347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
203447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
203547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_unpackhi_epi16(t[1], t[2]);
203647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
203747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
203847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
203947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
204047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
204147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
204247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
204347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
204447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
204547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
204647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
204747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
204847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
204947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
205047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
205147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
205247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
205347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
205447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
205547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
205647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
205747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
205847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
205947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
206047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
206147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
206247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
206347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
206447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = _mm_packs_epi32(v[0], v[1]);
206547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[6] = _mm_packs_epi32(v[4], v[5]);
206647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[10] = _mm_packs_epi32(v[2], v[3]);
206747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[14] = _mm_packs_epi32(v[6], v[7]);
206847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
206947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 2
207047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
207147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
207247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
207347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_unpackhi_epi16(s[3], s[4]);
207447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
207547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
207647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
207747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
207847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
207947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
208047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
208147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
208247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
208347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
208447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
208547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
208647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
208747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
208847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
208947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
209047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
209147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
209247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
209347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
209447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
209547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
209647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
209747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
209847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
209947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
210047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
210147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
210247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[2] = _mm_packs_epi32(v[0], v[1]);
210347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[3] = _mm_packs_epi32(v[2], v[3]);
210447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[4] = _mm_packs_epi32(v[4], v[5]);
210547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[5] = _mm_packs_epi32(v[6], v[7]);
210647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
210747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 3
210847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[0] = _mm_add_epi16(s[0], t[3]);
210947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[1] = _mm_add_epi16(s[1], t[2]);
211047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[2] = _mm_sub_epi16(s[1], t[2]);
211147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[3] = _mm_sub_epi16(s[0], t[3]);
211247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[4] = _mm_sub_epi16(s[7], t[4]);
211347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[5] = _mm_sub_epi16(s[6], t[5]);
211447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[6] = _mm_add_epi16(s[6], t[5]);
211547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p[7] = _mm_add_epi16(s[7], t[4]);
211647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
211747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 4
211847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
211947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
212047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
212147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_unpackhi_epi16(p[2], p[5]);
212247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
212347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
212447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
212588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
212688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
212788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
212888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
212947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
213047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
213147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
213247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
213347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
213447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
213547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
213647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
213747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
213847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
213947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
214047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
214147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
214247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
214347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
214447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
214547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
214647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
214747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
214847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
214947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
215047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[1] = _mm_packs_epi32(v[0], v[1]);
215147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[2] = _mm_packs_epi32(v[2], v[3]);
215247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[5] = _mm_packs_epi32(v[4], v[5]);
215347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  t[6] = _mm_packs_epi32(v[6], v[7]);
215447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
215547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 5
215647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[0] = _mm_add_epi16(p[0], t[1]);
215747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[1] = _mm_sub_epi16(p[0], t[1]);
215888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  s[2] = _mm_add_epi16(p[3], t[2]);
215988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  s[3] = _mm_sub_epi16(p[3], t[2]);
216088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  s[4] = _mm_sub_epi16(p[4], t[5]);
216188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  s[5] = _mm_add_epi16(p[4], t[5]);
216247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[6] = _mm_sub_epi16(p[7], t[6]);
216347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[7] = _mm_add_epi16(p[7], t[6]);
216447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
216547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 6
216647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
216747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
216847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
216947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
217047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
217147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
217247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
217347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_unpackhi_epi16(s[3], s[4]);
217447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
217547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
217647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
217747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
217847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
217947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
218047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
218147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
218247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
218347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
218447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
218547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
218647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
218747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
218847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
218947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
219047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
219147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
219247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
219347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
219447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
219547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
219647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
219747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
219847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
219947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
220047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
220147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
220247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
220347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
220447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
220547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
220647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
220747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
220847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
220947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
221047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
221147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
221247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
221347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
221447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
221547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
221647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
221747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
221847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
221947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
222047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
222147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
222247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
222347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
222447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
222547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
222647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1]  = _mm_packs_epi32(v[0], v[1]);
222747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[9]  = _mm_packs_epi32(v[2], v[3]);
222847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[5]  = _mm_packs_epi32(v[4], v[5]);
222947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[13] = _mm_packs_epi32(v[6], v[7]);
223047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3]  = _mm_packs_epi32(v[8], v[9]);
223147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[11] = _mm_packs_epi32(v[10], v[11]);
223247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[7]  = _mm_packs_epi32(v[12], v[13]);
223347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[15] = _mm_packs_epi32(v[14], v[15]);
223447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
223547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
2236dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fadst16_8col(__m128i *in) {
223747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // perform 16x16 1-D ADST for 8 columns
223847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i s[16], x[16], u[32], v[32];
223947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
224047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
224147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
224247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
224347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
224447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
224547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
224647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
224747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
224847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
224947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
225047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
225147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
225247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
225347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
225447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
225547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
225647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
225747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
225847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
225947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
226047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
226147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
226247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
226347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
226447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
226547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
226647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
226747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
226847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
226947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i kZero = _mm_set1_epi16(0);
227047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
227147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
227247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
227347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
227447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
227547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
227647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
227747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
227847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
227947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
228047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
228147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
228247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
228347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
228447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
228547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
228647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
228747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
228847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
228947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
229047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
229147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
229247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
229347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
229447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
229547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
229647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
229747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
229847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
229947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
230047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
230147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
230247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
230347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
230447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
230547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
230647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
230747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
230847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
230947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
231047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
231147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
231247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
231347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
231447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
231547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
231647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
231747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
231847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
231947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
232047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
232147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(v[0], v[16]);
232247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_add_epi32(v[1], v[17]);
232347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(v[2], v[18]);
232447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_add_epi32(v[3], v[19]);
232547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_add_epi32(v[4], v[20]);
232647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_add_epi32(v[5], v[21]);
232747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_add_epi32(v[6], v[22]);
232847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_add_epi32(v[7], v[23]);
232947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[8] = _mm_add_epi32(v[8], v[24]);
233047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[9] = _mm_add_epi32(v[9], v[25]);
233147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[10] = _mm_add_epi32(v[10], v[26]);
233247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[11] = _mm_add_epi32(v[11], v[27]);
233347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[12] = _mm_add_epi32(v[12], v[28]);
233447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[13] = _mm_add_epi32(v[13], v[29]);
233547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[14] = _mm_add_epi32(v[14], v[30]);
233647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[15] = _mm_add_epi32(v[15], v[31]);
233747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[16] = _mm_sub_epi32(v[0], v[16]);
233847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[17] = _mm_sub_epi32(v[1], v[17]);
233947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[18] = _mm_sub_epi32(v[2], v[18]);
234047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[19] = _mm_sub_epi32(v[3], v[19]);
234147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[20] = _mm_sub_epi32(v[4], v[20]);
234247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[21] = _mm_sub_epi32(v[5], v[21]);
234347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[22] = _mm_sub_epi32(v[6], v[22]);
234447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[23] = _mm_sub_epi32(v[7], v[23]);
234547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[24] = _mm_sub_epi32(v[8], v[24]);
234647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[25] = _mm_sub_epi32(v[9], v[25]);
234747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[26] = _mm_sub_epi32(v[10], v[26]);
234847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[27] = _mm_sub_epi32(v[11], v[27]);
234947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[28] = _mm_sub_epi32(v[12], v[28]);
235047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[29] = _mm_sub_epi32(v[13], v[29]);
235147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[30] = _mm_sub_epi32(v[14], v[30]);
235247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[31] = _mm_sub_epi32(v[15], v[31]);
235347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
235447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
235547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
235647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
235747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
235847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
235947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
236047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
236147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
236247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
236347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
236447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
236547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
236647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
236747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
236847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
236947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
237047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
237147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
237247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
237347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
237447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
237547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
237647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
237747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
237847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
237947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
238047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
238147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
238247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
238347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
238447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
238547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
238647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
238747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
238847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
238947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
239047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
239147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
239247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
239347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
239447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
239547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
239647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
239747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
239847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
239947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
240047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
240147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
240247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
240347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
240447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
240547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
240647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
240747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
240847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
240947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
241047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
241147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
241247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
241347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
241447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
241547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
241647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
241747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
241847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
241947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
242047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[0] = _mm_packs_epi32(u[0], u[1]);
242147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[1] = _mm_packs_epi32(u[2], u[3]);
242247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[2] = _mm_packs_epi32(u[4], u[5]);
242347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[3] = _mm_packs_epi32(u[6], u[7]);
242447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[4] = _mm_packs_epi32(u[8], u[9]);
242547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[5] = _mm_packs_epi32(u[10], u[11]);
242647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[6] = _mm_packs_epi32(u[12], u[13]);
242747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[7] = _mm_packs_epi32(u[14], u[15]);
242847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[8] = _mm_packs_epi32(u[16], u[17]);
242947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[9] = _mm_packs_epi32(u[18], u[19]);
243047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[10] = _mm_packs_epi32(u[20], u[21]);
243147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[11] = _mm_packs_epi32(u[22], u[23]);
243247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[12] = _mm_packs_epi32(u[24], u[25]);
243347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[13] = _mm_packs_epi32(u[26], u[27]);
243447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[14] = _mm_packs_epi32(u[28], u[29]);
243547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[15] = _mm_packs_epi32(u[30], u[31]);
243647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
243747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 2
243847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
243947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
244047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
244147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
244247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
244347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
244447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
244547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
244647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
244747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
244847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
244947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
245047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
245147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
245247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
245347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
245447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
245547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
245647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
245747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
245847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
245947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
246047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
246147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
246247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
246347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
246447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(v[0], v[8]);
246547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_add_epi32(v[1], v[9]);
246647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(v[2], v[10]);
246747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_add_epi32(v[3], v[11]);
246847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_add_epi32(v[4], v[12]);
246947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_add_epi32(v[5], v[13]);
247047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_add_epi32(v[6], v[14]);
247147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_add_epi32(v[7], v[15]);
247247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[8] = _mm_sub_epi32(v[0], v[8]);
247347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[9] = _mm_sub_epi32(v[1], v[9]);
247447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[10] = _mm_sub_epi32(v[2], v[10]);
247547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[11] = _mm_sub_epi32(v[3], v[11]);
247647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[12] = _mm_sub_epi32(v[4], v[12]);
247747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[13] = _mm_sub_epi32(v[5], v[13]);
247847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[14] = _mm_sub_epi32(v[6], v[14]);
247947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[15] = _mm_sub_epi32(v[7], v[15]);
248047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
248147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
248247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
248347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
248447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
248547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
248647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
248747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
248847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
248947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
249047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
249147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
249247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
249347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
249447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
249547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
249647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
249747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
249847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
249947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
250047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
250147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
250247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
250347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
250447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
250547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
250647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
250747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
250847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
250947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
251047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
251147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
251247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
251347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
251447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
251547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[0] = _mm_add_epi16(s[0], s[4]);
251647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[1] = _mm_add_epi16(s[1], s[5]);
251747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[2] = _mm_add_epi16(s[2], s[6]);
251847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[3] = _mm_add_epi16(s[3], s[7]);
251947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[4] = _mm_sub_epi16(s[0], s[4]);
252047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[5] = _mm_sub_epi16(s[1], s[5]);
252147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[6] = _mm_sub_epi16(s[2], s[6]);
252247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[7] = _mm_sub_epi16(s[3], s[7]);
252347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[8] = _mm_packs_epi32(u[0], u[1]);
252447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[9] = _mm_packs_epi32(u[2], u[3]);
252547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[10] = _mm_packs_epi32(u[4], u[5]);
252647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[11] = _mm_packs_epi32(u[6], u[7]);
252747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[12] = _mm_packs_epi32(u[8], u[9]);
252847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[13] = _mm_packs_epi32(u[10], u[11]);
252947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[14] = _mm_packs_epi32(u[12], u[13]);
253047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  x[15] = _mm_packs_epi32(u[14], u[15]);
253147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
253247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 3
253347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
253447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
253547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
253647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
253747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
253847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
253947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
254047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
254147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
254247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
254347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
254447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
254547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
254647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
254747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
254847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
254947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
255047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
255147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
255247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
255347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
255447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
255547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
255647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
255747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
255847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
255947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(v[0], v[4]);
256047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_add_epi32(v[1], v[5]);
256147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(v[2], v[6]);
256247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_add_epi32(v[3], v[7]);
256347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_sub_epi32(v[0], v[4]);
256447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_sub_epi32(v[1], v[5]);
256547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_sub_epi32(v[2], v[6]);
256647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_sub_epi32(v[3], v[7]);
256747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[8] = _mm_add_epi32(v[8], v[12]);
256847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[9] = _mm_add_epi32(v[9], v[13]);
256947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[10] = _mm_add_epi32(v[10], v[14]);
257047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[11] = _mm_add_epi32(v[11], v[15]);
257147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[12] = _mm_sub_epi32(v[8], v[12]);
257247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[13] = _mm_sub_epi32(v[9], v[13]);
257347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[14] = _mm_sub_epi32(v[10], v[14]);
257447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[15] = _mm_sub_epi32(v[11], v[15]);
257547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
257647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
257747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
257847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
257947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
258047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
258147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
258247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
258347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
258447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
258547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
258647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
258747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
258847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
258947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
259047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
259147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
259247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
259347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
259447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
259547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
259647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
259747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
259847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
259947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
260047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
260147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
260247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
260347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
260447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
260547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
260647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
260747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
260847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
260947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
261047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[0] = _mm_add_epi16(x[0], x[2]);
261147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[1] = _mm_add_epi16(x[1], x[3]);
261247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[2] = _mm_sub_epi16(x[0], x[2]);
261347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[3] = _mm_sub_epi16(x[1], x[3]);
261447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[4] = _mm_packs_epi32(v[0], v[1]);
261547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[5] = _mm_packs_epi32(v[2], v[3]);
261647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[6] = _mm_packs_epi32(v[4], v[5]);
261747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[7] = _mm_packs_epi32(v[6], v[7]);
261847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[8] = _mm_add_epi16(x[8], x[10]);
261947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[9] = _mm_add_epi16(x[9], x[11]);
262047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[10] = _mm_sub_epi16(x[8], x[10]);
262147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[11] = _mm_sub_epi16(x[9], x[11]);
262247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[12] = _mm_packs_epi32(v[8], v[9]);
262347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[13] = _mm_packs_epi32(v[10], v[11]);
262447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[14] = _mm_packs_epi32(v[12], v[13]);
262547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  s[15] = _mm_packs_epi32(v[14], v[15]);
262647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
262747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  // stage 4
262847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
262947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
263047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
263147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
263247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
263347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
263447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
263547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
263647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
263747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
263847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
263947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
264047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
264147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
264247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
264347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
264447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
264547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
264647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
264747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
264847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
264947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
265047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
265147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
265247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
265347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
265447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
265547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
265647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
265747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
265847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
265947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
266047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
266147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
266247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
266347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
266447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
266547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
266647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
266747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
266847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
266947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
267047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
267147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
267247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
267347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
267447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
267547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
267647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
267747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
267847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
267947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
268047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
268147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
268247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
268347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
268447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
268547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
268647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
268747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
268847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[0] = s[0];
268947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[1] = _mm_sub_epi16(kZero, s[8]);
269047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[2] = s[12];
269147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[3] = _mm_sub_epi16(kZero, s[4]);
269247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[4] = _mm_packs_epi32(v[4], v[5]);
269347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[5] = _mm_packs_epi32(v[12], v[13]);
269447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[6] = _mm_packs_epi32(v[8], v[9]);
269547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[7] = _mm_packs_epi32(v[0], v[1]);
269647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[8] = _mm_packs_epi32(v[2], v[3]);
269747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[9] = _mm_packs_epi32(v[10], v[11]);
269847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[10] = _mm_packs_epi32(v[14], v[15]);
269947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[11] = _mm_packs_epi32(v[6], v[7]);
270047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[12] = s[5];
270147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[13] = _mm_sub_epi16(kZero, s[13]);
270247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[14] = s[9];
270347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  in[15] = _mm_sub_epi16(kZero, s[1]);
270447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
270547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
// Runs fdct16_8col on in0 and then on in1, and finally passes both to
// array_transpose_16x16. Within this file the caller hands in two 16-entry
// __m128i arrays.
void fdct16_sse2(__m128i *in0, __m128i *in1) {
  __m128i *halves[2];
  int k;

  halves[0] = in0;
  halves[1] = in1;

  // Same order as before: in0 is transformed first, in1 second.
  for (k = 0; k < 2; ++k) {
    fdct16_8col(halves[k]);
  }

  array_transpose_16x16(in0, in1);
}
271147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
// Runs fadst16_8col on in0 and then on in1, and finally passes both to
// array_transpose_16x16. Within this file the caller hands in two 16-entry
// __m128i arrays.
void fadst16_sse2(__m128i *in0, __m128i *in1) {
  __m128i *halves[2];
  int k;

  halves[0] = in0;
  halves[1] = in1;

  // Same order as before: in0 is transformed first, in1 second.
  for (k = 0; k < 2; ++k) {
    fadst16_8col(halves[k]);
  }

  array_transpose_16x16(in0, in1);
}
271747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
271876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
271976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                       int stride, int tx_type) {
272047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i in0[16], in1[16];
272176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
272247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  switch (tx_type) {
272376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case DCT_DCT:
272476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      vp9_fdct16x16_sse2(input, output, stride);
272547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
272676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case ADST_DCT:
272776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_16x16(input, in0, in1, stride);
2728dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst16_sse2(in0, in1);
272947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      right_shift_16x16(in0, in1);
2730dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fdct16_sse2(in0, in1);
273176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_16x16(output, in0, in1, 16);
273247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
273376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case DCT_ADST:
273476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_16x16(input, in0, in1, stride);
2735dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fdct16_sse2(in0, in1);
273647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      right_shift_16x16(in0, in1);
2737dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst16_sse2(in0, in1);
273876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_16x16(output, in0, in1, 16);
273947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
274076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    case ADST_ADST:
274176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      load_buffer_16x16(input, in0, in1, stride);
2742dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst16_sse2(in0, in1);
274347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      right_shift_16x16(in0, in1);
2744dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      fadst16_sse2(in0, in1);
274576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org      write_buffer_16x16(output, in0, in1, 16);
274647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
274747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    default:
274847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      assert(0);
274947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      break;
275047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  }
275147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
275247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
// Sums all 32x32 input samples, arithmetic-shifts the total right by 3 and
// stores the resulting 128-bit register at output.
//
// input is read with aligned 128-bit loads (four per row, rows `stride`
// elements apart) and output is written with one aligned 128-bit store, so
// both pointers and every row start must be 16-byte aligned. Eight int16_t
// are written: output[0]/output[1] are the low/high halves of the 32-bit
// shifted total; the remaining lanes hold by-products of the reduction.
void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc16 = zero;
  __m128i lo32, hi32, acc32;
  int row;

  // 16-bit lane k collects columns k, k + 8, k + 16 and k + 24 of every row.
  // _mm_add_epi16 wraps, so the accumulation order does not affect the result.
  for (row = 0; row < 32; ++row) {
    const __m128i *const src = (const __m128i *)input;
    const __m128i left =
        _mm_add_epi16(_mm_load_si128(src + 0), _mm_load_si128(src + 1));
    const __m128i right =
        _mm_add_epi16(_mm_load_si128(src + 2), _mm_load_si128(src + 3));

    acc16 = _mm_add_epi16(acc16, left);
    acc16 = _mm_add_epi16(acc16, right);
    input += stride;
  }

  // Sign-extend the eight 16-bit sums to 32 bits: park each one in the upper
  // half of a 32-bit lane, then shift it back down arithmetically.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc16), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc16), 16);
  acc32 = _mm_add_epi32(lo32, hi32);

  // Interleave the four 32-bit partial sums with zeros and fold them so the
  // grand total ends up in the lowest 32-bit lane.
  lo32 = _mm_unpacklo_epi32(acc32, zero);
  hi32 = _mm_unpackhi_epi32(acc32, zero);
  acc32 = _mm_add_epi32(lo32, hi32);
  acc32 = _mm_add_epi32(acc32, _mm_srli_si128(acc32, 8));

  acc32 = _mm_srai_epi32(acc32, 3);
  _mm_store_si128((__m128i *)output, acc32);
}
282388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org
2824ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
282553a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#define FDCT32x32_HIGH_PRECISION 0
282653a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
282753a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#undef  FDCT32x32_HIGH_PRECISION
2828ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org#undef  FDCT32x32_2D
282953a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org
2830ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org#define FDCT32x32_2D vp9_fdct32x32_sse2
283153a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#define FDCT32x32_HIGH_PRECISION 1
283253a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
283353a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#undef  FDCT32x32_HIGH_PRECISION
2834ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org#undef  FDCT32x32_2D
2835