13f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org/* 23f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 33f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org * 43f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org * Use of this source code is governed by a BSD-style license 53f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org * that can be found in the LICENSE file in the root of the source 63f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org * tree. An additional intellectual property rights grant can be found 73f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org * in the file PATENTS. All contributing project authors may 83f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org * be found in the AUTHORS file in the root of the source tree. 93f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org */ 103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org 113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org#include <emmintrin.h> // SSE2 123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org#include "vp9/common/vp9_idct.h" // for cospi constants 1347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org#include "vpx_ports/mem.h" 143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org 15d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org#include "vp9/common/x86/vp9_idct_intrin_sse2.h" 16d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org 1788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgvoid vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) { 1888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i in0, in1; 1988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i tmp; 2088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 
const __m128i zero = _mm_setzero_si128(); 2188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 2288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 2388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) 2488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org (input + 2 * stride))); 2588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) 2688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org (input + 3 * stride))); 2788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 2888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org tmp = _mm_add_epi16(in0, in1); 2988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_unpacklo_epi16(zero, tmp); 3088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_unpackhi_epi16(zero, tmp); 3188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_srai_epi32(in0, 16); 3288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_srai_epi32(in1, 16); 3388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 3488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org tmp = _mm_add_epi32(in0, in1); 3588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_unpacklo_epi32(tmp, zero); 3688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_unpackhi_epi32(tmp, zero); 3788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 3888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org tmp = _mm_add_epi32(in0, in1); 3988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_srli_si128(tmp, 8); 4088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 
4188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_add_epi32(tmp, in0); 4288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_slli_epi32(in1, 1); 4388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm_store_si128((__m128i *)(output), in0); 4488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org} 4588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 46ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgvoid vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { 4793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // This 2D transform implements 4 vertical 1D transforms followed 4893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // by 4 horizontal 1D transforms. The multiplies and adds are as given 4993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // by Chen, Smith and Fralick ('77). The commands for moving the data 5093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // around have been minimized by hand. 5193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // For the purposes of the comments, the 16 inputs are referred to at i0 5293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // through iF (in raster order), intermediate variables are a0, b0, c0 5393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // through f, and correspond to the in-place computations mapped to input 5493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // locations. The outputs, o0 through oF are labeled according to the 5593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // output locations. 
5693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org 573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Constants 5893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // These are the coefficients used for the multiplies. 5993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), 6093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // where cospi_N_64 = cos(N pi /64) 6193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64, 6293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, cospi_16_64, 6393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, -cospi_16_64, 6493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, -cospi_16_64); 6593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64, 6693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, -cospi_16_64, 6793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, cospi_16_64, 6893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, cospi_16_64); 6993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64, 7093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_8_64, cospi_24_64, 7193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_24_64, -cospi_8_64, 7293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_24_64, -cospi_8_64); 7393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64, 7493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_24_64, 
-cospi_8_64, 7593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_8_64, cospi_24_64, 7693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_8_64, cospi_24_64); 7793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64, 7893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, cospi_16_64, 7993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, cospi_16_64, 8093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, cospi_16_64); 8193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64, 8293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, -cospi_16_64, 8393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, -cospi_16_64, 8493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_16_64, -cospi_16_64); 8593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64, 8693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_8_64, cospi_24_64, 8793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org -cospi_8_64, -cospi_24_64, 8893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org -cospi_8_64, -cospi_24_64); 8993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64, 9093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org cospi_24_64, -cospi_8_64, 9193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org -cospi_24_64, cospi_8_64, 9293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org -cospi_24_64, cospi_8_64); 9393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org 
943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 9593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // This second rounding constant saves doing some extra adds at the end 9693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING 9793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org +(DCT_CONST_ROUNDING << 1)); 9893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const int DCT_CONST_BITS2 = DCT_CONST_BITS+2; 993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 1003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 101d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i in0, in1; 10293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org 1033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Load inputs. 
1043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 1053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 10693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 10793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) 10893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org (input + 2 * stride))); 109d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) 11093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org (input + 3 * stride))); 11193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // in0 = [i0 i1 i2 i3 iC iD iE iF] 11293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // in1 = [i4 i5 i6 i7 i8 i9 iA iB] 113d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 11493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org 11593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // multiply by 16 to give some extra precision 1163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in0 = _mm_slli_epi16(in0, 4); 1173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in1 = _mm_slli_epi16(in1, 4); 1183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // if (i == 0 && input[0]) input[0] += 1; 11993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // add 1 to the upper left pixel if it is non-zero, which helps reduce 12093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // the round-trip error 1213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 122411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // The mask will only contain whether the first value is zero, all 
1233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // other comparison will fail as something shifted by 4 (above << 4) 1243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // can never be equal to one. To increment in the non-zero case, we 1253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // add the mask and one for the first element: 1263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // - if zero, mask = -1, v = v - 1 + 1 = v 1273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 1283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); 1293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in0 = _mm_add_epi16(in0, mask); 1303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in0 = _mm_add_epi16(in0, k__nonzero_bias_b); 1313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 1323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 13393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // There are 4 total stages, alternating between an add/subtract stage 13493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // followed by an multiply-and-add stage. 
13593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org { 13693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // Stage 1: Add/subtract 13793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org 13893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // in0 = [i0 i1 i2 i3 iC iD iE iF] 13993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // in1 = [i4 i5 i6 i7 i8 i9 iA iB] 14093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i r0 = _mm_unpacklo_epi16(in0, in1); 14193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i r1 = _mm_unpackhi_epi16(in0, in1); 14293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] 14393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // r1 = [iC i8 iD i9 iE iA iF iB] 14493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); 14593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); 14693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] 14793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // r3 = [iC i8 iD i9 iF iB iE iA] 14893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org 14993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i t0 = _mm_add_epi16(r2, r3); 15093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i t1 = _mm_sub_epi16(r2, r3); 15193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] 15293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // t1 = [aC a8 aD a9 aF aB aE aA] 15393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org 15493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // Stage 2: multiply by constants (which 
gets us into 32 bits). 15593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // The constants needed here are: 15693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] 15793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] 15893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] 15993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] 16093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); 16193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); 16293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); 16393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); 16493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // Then add and right-shift to get back to 16-bit range 1653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 16693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 16893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 17093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 
1713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 17293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 17393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // w0 = [b0 b1 b7 b6] 17493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // w1 = [b8 b9 bF bE] 17593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // w2 = [b4 b5 b3 b2] 17693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // w3 = [bC bD bB bA] 17793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i x0 = _mm_packs_epi32(w0, w1); 17893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i x1 = _mm_packs_epi32(w2, w3); 17993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // x0 = [b0 b1 b7 b6 b8 b9 bF bE] 18093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // x1 = [b4 b5 b3 b2 bC bD bB bA] 18193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org in0 = _mm_shuffle_epi32(x0, 0xD8); 18293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org in1 = _mm_shuffle_epi32(x1, 0x8D); 18393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // in0 = [b0 b1 b8 b9 b7 b6 bF bE] 18493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // in1 = [b3 b2 bB bA b4 b5 bC bD] 185d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org } 186d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org { 18793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // vertical DCTs finished. Now we do the horizontal DCTs. 
18893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // Stage 3: Add/subtract 18993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org 19093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i t0 = _mm_add_epi16(in0, in1); 19193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i t1 = _mm_sub_epi16(in0, in1); 19293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // t0 = [c0 c1 c8 c9 c4 c5 cC cD] 19393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] 19493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org 19593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // Stage 4: multiply by constants (which gets us into 32 bits). 19693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // The constants needed here are: 19793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] 19893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] 19993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] 20093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] 20193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); 20293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); 20393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); 20493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); 20593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // Then add and right-shift to get back to 16-bit range 
20693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // but this combines the final right-shift as well to save operations 20793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // This unusual rounding operations is to maintain bit-accurate 20893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // compatibility with the c version of this function which has two 20993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // rounding steps in a row. 21093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); 21193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); 21293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); 21393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); 21493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); 21593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); 21693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); 21793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); 21893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // w0 = [o0 o4 o8 oC] 21993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // w1 = [o2 o6 oA oE] 22093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // w2 = [o1 o5 o9 oD] 22193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // w3 = [o3 o7 oB oF] 22293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // remember the o's are numbered 
according to the correct output location 22393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i x0 = _mm_packs_epi32(w0, w1); 22493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i x1 = _mm_packs_epi32(w2, w3); 22593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // x0 = [o0 o4 o8 oC o2 o6 oA oE] 22693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // x1 = [o1 o5 o9 oD o3 o7 oB oF] 22793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i y0 = _mm_unpacklo_epi16(x0, x1); 22893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org const __m128i y1 = _mm_unpackhi_epi16(x0, x1); 22993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // y0 = [o0 o1 o4 o5 o8 o9 oC oD] 23093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // y1 = [o2 o3 o6 o7 oA oB oE oF] 23193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org in0 = _mm_unpacklo_epi32(y0, y1); 23293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] 23393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org in1 = _mm_unpackhi_epi32(y0, y1); 23493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // in1 = [o8 o9 oA oB oC oD oE oF] 23593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org } 23693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // Post-condition (v + 1) >> 2 is now incorporated into previous 23793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // add and right-shift commands. Only 2 store instructions needed 23893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org // because we are using the fact that 1/3 are stored just after 0/2. 
23993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org { 24093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); 24193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); 2423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 2433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org} 2443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org 24593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org 246ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgstatic INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, 247ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org int stride) { 24847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 24947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 25047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i mask; 25147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 25247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 25347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 25447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); 25547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 25647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 25747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = _mm_slli_epi16(in[0], 4); 
25847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[1] = _mm_slli_epi16(in[1], 4); 25947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[2] = _mm_slli_epi16(in[2], 4); 26047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[3] = _mm_slli_epi16(in[3], 4); 26147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 26247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); 26347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = _mm_add_epi16(in[0], mask); 26447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); 26547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 26647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 26747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void write_buffer_4x4(int16_t *output, __m128i *res) { 26847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i kOne = _mm_set1_epi16(1); 26947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); 27047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); 27147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i out01 = _mm_add_epi16(in01, kOne); 27247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i out23 = _mm_add_epi16(in23, kOne); 27347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org out01 = _mm_srai_epi16(out01, 2); 27447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org out23 = _mm_srai_epi16(out23, 2); 27547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 0 * 8), out01); 27647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i 
*)(output + 1 * 8), out23); 27747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 27847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 27947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void transpose_4x4(__m128i *res) { 28047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // Combine and transpose 28147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // 00 01 02 03 20 21 22 23 28247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // 10 11 12 13 30 31 32 33 28347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 28447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); 28547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 28647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // 00 10 01 11 02 12 03 13 28747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // 20 30 21 31 22 32 23 33 28847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); 28947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); 29047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 29147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // 00 10 20 30 01 11 21 31 29247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // 02 12 22 32 03 13 23 33 29347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // only use the first 4 16-bit integers 29447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[1] = _mm_unpackhi_epi64(res[0], res[0]); 29547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[3] = _mm_unpackhi_epi64(res[2], res[2]); 29647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 
29747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 298dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fdct4_sse2(__m128i *in) { 29947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 30047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 301ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 302ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 30347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 30447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 30547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i u[4], v[4]; 306ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org u[0]=_mm_unpacklo_epi16(in[0], in[1]); 307ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org u[1]=_mm_unpacklo_epi16(in[3], in[2]); 308ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org 309ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org v[0] = _mm_add_epi16(u[0], u[1]); 310ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org v[1] = _mm_sub_epi16(u[0], u[1]); 31147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 31247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 31347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 314ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 
315ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 31647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 31747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 31847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 31947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 32047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 32147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 32247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 32347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 32447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 32547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 32647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = _mm_packs_epi32(u[0], u[1]); 32747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[1] = _mm_packs_epi32(u[2], u[3]); 32847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org transpose_4x4(in); 32947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 33047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 331dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fadst4_sse2(__m128i *in) { 33247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); 33347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__sinpi_p04_m01 = 
pair_set_epi16(sinpi_4_9, -sinpi_1_9); 33447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); 33547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); 33647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); 33747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i kZero = _mm_set1_epi16(0); 33847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 33947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i u[8], v[8]; 34047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in7 = _mm_add_epi16(in[0], in[1]); 34147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 34247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_unpacklo_epi16(in[0], in[1]); 34347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_unpacklo_epi16(in[2], in[3]); 34447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_unpacklo_epi16(in7, kZero); 34547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_unpacklo_epi16(in[2], kZero); 346d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org u[4] = _mm_unpacklo_epi16(in[3], kZero); 34747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 34847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 34947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 35047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 
35147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 35247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 35347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 354d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); 35547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 35647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi32(v[0], v[1]); 357d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org u[1] = _mm_sub_epi32(v[2], v[6]); 35847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_add_epi32(v[3], v[4]); 35947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_sub_epi32(u[2], u[0]); 36047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_slli_epi32(v[5], 2); 36147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_sub_epi32(u[4], v[5]); 36247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_add_epi32(u[3], u[5]); 36347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 36447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 36547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 36647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 36747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 36847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 36947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_srai_epi32(v[0], 
DCT_CONST_BITS); 37047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 37147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 37247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 37347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 37447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = _mm_packs_epi32(u[0], u[2]); 37547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[1] = _mm_packs_epi32(u[1], u[3]); 37647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org transpose_4x4(in); 37747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 37847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 37976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_fht4x4_sse2(const int16_t *input, int16_t *output, 38076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org int stride, int tx_type) { 38147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in[4]; 38276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 38347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org switch (tx_type) { 38476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case DCT_DCT: 38576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org vp9_fdct4x4_sse2(input, output, stride); 38647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 38776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case ADST_DCT: 38876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org load_buffer_4x4(input, in, stride); 389dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst4_sse2(in); 390dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fdct4_sse2(in); 
39176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org write_buffer_4x4(output, in); 39247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 39376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case DCT_ADST: 39476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org load_buffer_4x4(input, in, stride); 395dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fdct4_sse2(in); 396dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst4_sse2(in); 39776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org write_buffer_4x4(output, in); 39847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 39976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case ADST_ADST: 40076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org load_buffer_4x4(input, in, stride); 401dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst4_sse2(in); 402dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst4_sse2(in); 40376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org write_buffer_4x4(output, in); 40447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 40576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org default: 40676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org assert(0); 40776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org break; 40847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org } 40947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 41047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 41188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgvoid vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) { 41288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); 
41388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); 41488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); 41588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); 41688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i u0, u1, sum; 41788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 41888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_add_epi16(in0, in1); 41988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u1 = _mm_add_epi16(in2, in3); 42088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 42188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); 42288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); 42388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); 42488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); 42588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 42688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(u0, u1); 42788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 42888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_add_epi16(in0, in1); 42988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in2 = _mm_add_epi16(in2, in3); 43088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, in0); 43188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 43288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_setzero_si128(); 
43388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, in2); 43488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 43588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_unpacklo_epi16(u0, sum); 43688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_unpackhi_epi16(u0, sum); 43788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_srai_epi32(in0, 16); 43888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_srai_epi32(in1, 16); 43988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 44088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi32(in0, in1); 44188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_unpacklo_epi32(sum, u0); 44288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_unpackhi_epi32(sum, u0); 44388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 44488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi32(in0, in1); 44588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_srli_si128(sum, 8); 44688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 44788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_add_epi32(sum, in0); 44888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm_store_si128((__m128i *)(output), in1); 44988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org} 45088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 451ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgvoid vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { 4523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org int pass; 4533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Constants 4543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // When we use them, in one case, they 
are all the same. In all others 4553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // it's a pair of them that we need to repeat four times. This is done 4563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // by constructing the 32 bit constant corresponding to that pair. 4573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 4583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 4593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 4603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 4613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 4623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 4633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 4643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 4653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 4663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Load input 46747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); 46847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); 
46947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); 47047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); 47147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); 47247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); 47347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); 47447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); 4753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Pre-condition input (shift by two) 4763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in0 = _mm_slli_epi16(in0, 2); 4773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in1 = _mm_slli_epi16(in1, 2); 4783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in2 = _mm_slli_epi16(in2, 2); 4793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in3 = _mm_slli_epi16(in3, 2); 4803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in4 = _mm_slli_epi16(in4, 2); 4813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in5 = _mm_slli_epi16(in5, 2); 4823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in6 = _mm_slli_epi16(in6, 2); 4833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in7 = _mm_slli_epi16(in7, 2); 4843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org 4853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // We do two passes, first the columns, then the rows. 
The results of the 4863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // first pass are transposed so that the same column code can be reused. The 4873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // results of the second pass are also transposed so that the rows (processed 4883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // as columns) are put back in row positions. 4893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org for (pass = 0; pass < 2; pass++) { 4903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // To store results of each pass before the transpose. 4913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i res0, res1, res2, res3, res4, res5, res6, res7; 492411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // Add/subtract 4933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q0 = _mm_add_epi16(in0, in7); 4943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q1 = _mm_add_epi16(in1, in6); 4953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q2 = _mm_add_epi16(in2, in5); 4963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q3 = _mm_add_epi16(in3, in4); 4973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q4 = _mm_sub_epi16(in3, in4); 4983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q5 = _mm_sub_epi16(in2, in5); 4993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q6 = _mm_sub_epi16(in1, in6); 5003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q7 = _mm_sub_epi16(in0, in7); 5013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Work on first four results 5023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 
503411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // Add/subtract 5043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r0 = _mm_add_epi16(q0, q3); 5053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r1 = _mm_add_epi16(q1, q2); 5063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r2 = _mm_sub_epi16(q1, q2); 5073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r3 = _mm_sub_epi16(q0, q3); 5083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Interleave to do the multiply by constants which gets us into 32bits 5093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 5103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t1 = _mm_unpackhi_epi16(r0, r1); 5113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 5123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(r2, r3); 5133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 5143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 5153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 5163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); 5173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 5183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); 5193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u6 = _mm_madd_epi16(t2, 
k__cospi_m08_p24); 5203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); 5213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 5223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 5233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 5243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 5253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 5263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 5273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 5283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 5293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 5303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 5313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 5323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 5333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 5343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 5353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w5 = 
_mm_srai_epi32(v5, DCT_CONST_BITS); 5363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 5373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 5383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 5393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res0 = _mm_packs_epi32(w0, w1); 5403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res4 = _mm_packs_epi32(w2, w3); 5413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res2 = _mm_packs_epi32(w4, w5); 5423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res6 = _mm_packs_epi32(w6, w7); 5433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 5443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Work on next four results 5453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 5463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Interleave to do the multiply by constants which gets us into 32bits 5473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i d0 = _mm_unpacklo_epi16(q6, q5); 5483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i d1 = _mm_unpackhi_epi16(q6, q5); 5493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); 5503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); 5513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); 5523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); 5533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 
5543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); 5553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); 5563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); 5573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); 5583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); 5593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); 5603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); 5613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); 5623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 5633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r0 = _mm_packs_epi32(s0, s1); 5643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r1 = _mm_packs_epi32(s2, s3); 565411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // Add/subtract 5663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i x0 = _mm_add_epi16(q4, r0); 5673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i x1 = _mm_sub_epi16(q4, r0); 5683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i x2 = _mm_sub_epi16(q7, r1); 5693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i x3 = _mm_add_epi16(q7, r1); 5703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Interleave to do the multiply by constants which gets us into 32bits 
5713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(x0, x3); 5723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t1 = _mm_unpackhi_epi16(x0, x3); 5733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = _mm_unpacklo_epi16(x1, x2); 5743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(x1, x2); 5753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); 5763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); 5773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); 5783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); 5793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); 5803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); 5813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); 5823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); 5833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 5843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 5853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 5863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 
5873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 5883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 5893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 5903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 5913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 5923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 5933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 5943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 5953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 5963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 5973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 5983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 5993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 6003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 6013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res1 = _mm_packs_epi32(w0, w1); 6023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res7 = _mm_packs_epi32(w2, w3); 6033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res5 = 
_mm_packs_epi32(w4, w5); 6043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res3 = _mm_packs_epi32(w6, w7); 6053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 6063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Transpose the 8x8. 6073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 6083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 01 02 03 04 05 06 07 6093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 10 11 12 13 14 15 16 17 6103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 20 21 22 23 24 25 26 27 6113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 30 31 32 33 34 35 36 37 6123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 40 41 42 43 44 45 46 47 6133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 50 51 52 53 54 55 56 57 6143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 60 61 62 63 64 65 66 67 6153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 70 71 72 73 74 75 76 77 6163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); 6173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); 6183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); 6193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); 6203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); 6213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); 6223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); 
6233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); 6243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 10 01 11 02 12 03 13 6253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 20 30 21 31 22 32 23 33 6263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 04 14 05 15 06 16 07 17 6273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 24 34 25 35 26 36 27 37 6283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 40 50 41 51 42 52 43 53 6293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 60 70 61 71 62 72 63 73 6303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 44 54 45 55 46 56 47 57 6313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 64 74 65 75 66 76 67 77 6323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 6333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 6343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 6353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 6363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 6373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 6383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 6393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 6403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 10 20 30 01 11 21 31 
6413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 40 50 60 70 41 51 61 71 6423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 02 12 22 32 03 13 23 33 6433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 42 52 62 72 43 53 63 73 6443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 04 14 24 34 05 15 25 35 6453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 44 54 64 74 45 55 65 75 6463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 06 16 26 36 07 17 27 37 6473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 46 56 66 76 47 57 67 77 6483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 6493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 6503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 6513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 6523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 6533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 6543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 6553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 6563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 10 20 30 40 50 60 70 6573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 01 11 21 31 41 51 61 71 6583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 02 12 22 32 42 52 62 72 6593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 03 13 23 33 43 53 63 73 6603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 
04 14 24 34 44 54 64 74 6613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 05 15 25 35 45 55 65 75 6623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 06 16 26 36 46 56 66 76 6633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 07 17 27 37 47 57 67 77 6643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 6653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 6663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Post-condition output and store it 6673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 6683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Post-condition (division by two) 6693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // division of two 16 bits signed numbers using shifts 6703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // n / 2 = (n - (n >> 15)) >> 1 6713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i sign_in0 = _mm_srai_epi16(in0, 15); 6723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i sign_in1 = _mm_srai_epi16(in1, 15); 6733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i sign_in2 = _mm_srai_epi16(in2, 15); 6743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i sign_in3 = _mm_srai_epi16(in3, 15); 6753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i sign_in4 = _mm_srai_epi16(in4, 15); 6763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i sign_in5 = _mm_srai_epi16(in5, 15); 6773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i sign_in6 = _mm_srai_epi16(in6, 15); 6783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i sign_in7 = _mm_srai_epi16(in7, 15); 6793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in0 = 
_mm_sub_epi16(in0, sign_in0); 6803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in1 = _mm_sub_epi16(in1, sign_in1); 6813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in2 = _mm_sub_epi16(in2, sign_in2); 6823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in3 = _mm_sub_epi16(in3, sign_in3); 6833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in4 = _mm_sub_epi16(in4, sign_in4); 6843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in5 = _mm_sub_epi16(in5, sign_in5); 6853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in6 = _mm_sub_epi16(in6, sign_in6); 6863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in7 = _mm_sub_epi16(in7, sign_in7); 6873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in0 = _mm_srai_epi16(in0, 1); 6883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in1 = _mm_srai_epi16(in1, 1); 6893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in2 = _mm_srai_epi16(in2, 1); 6903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in3 = _mm_srai_epi16(in3, 1); 6913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in4 = _mm_srai_epi16(in4, 1); 6923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in5 = _mm_srai_epi16(in5, 1); 6933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in6 = _mm_srai_epi16(in6, 1); 6943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in7 = _mm_srai_epi16(in7, 1); 6953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // store results 69647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 0 * 8), in0); 69747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 1 * 8), in1); 69847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 2 * 8), 
in2); 69947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 3 * 8), in3); 70047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 4 * 8), in4); 70147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 5 * 8), in5); 70247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 6 * 8), in6); 70347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 7 * 8), in7); 70447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org } 70547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 70647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 70747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org// load 8x8 array 708ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgstatic INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, 709ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org int stride) { 710ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); 711ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); 712ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); 713ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); 714ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); 715ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); 716ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org in[6] = 
_mm_load_si128((const __m128i *)(input + 6 * stride)); 717ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); 71847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 71947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = _mm_slli_epi16(in[0], 2); 72047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[1] = _mm_slli_epi16(in[1], 2); 72147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[2] = _mm_slli_epi16(in[2], 2); 72247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[3] = _mm_slli_epi16(in[3], 2); 72347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[4] = _mm_slli_epi16(in[4], 2); 72447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[5] = _mm_slli_epi16(in[5], 2); 72547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[6] = _mm_slli_epi16(in[6], 2); 72647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[7] = _mm_slli_epi16(in[7], 2); 72747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 72847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 72947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org// right shift and rounding 73047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void right_shift_8x8(__m128i *res, int const bit) { 73147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i kOne = _mm_set1_epi16(1); 73247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const int bit_m02 = bit - 2; 73347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i sign0 = _mm_srai_epi16(res[0], 15); 73447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i sign1 = _mm_srai_epi16(res[1], 15); 73547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i sign2 = _mm_srai_epi16(res[2], 15); 
73647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i sign3 = _mm_srai_epi16(res[3], 15); 73747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i sign4 = _mm_srai_epi16(res[4], 15); 73847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i sign5 = _mm_srai_epi16(res[5], 15); 73947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i sign6 = _mm_srai_epi16(res[6], 15); 74047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i sign7 = _mm_srai_epi16(res[7], 15); 74147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 74247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org if (bit_m02 >= 0) { 74347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02); 74447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[0] = _mm_add_epi16(res[0], k_const_rounding); 74547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[1] = _mm_add_epi16(res[1], k_const_rounding); 74647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[2] = _mm_add_epi16(res[2], k_const_rounding); 74747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[3] = _mm_add_epi16(res[3], k_const_rounding); 74847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[4] = _mm_add_epi16(res[4], k_const_rounding); 74947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[5] = _mm_add_epi16(res[5], k_const_rounding); 75047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[6] = _mm_add_epi16(res[6], k_const_rounding); 75147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[7] = _mm_add_epi16(res[7], k_const_rounding); 75247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org } 75347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
75447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[0] = _mm_sub_epi16(res[0], sign0); 75547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[1] = _mm_sub_epi16(res[1], sign1); 75647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[2] = _mm_sub_epi16(res[2], sign2); 75747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[3] = _mm_sub_epi16(res[3], sign3); 75847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[4] = _mm_sub_epi16(res[4], sign4); 75947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[5] = _mm_sub_epi16(res[5], sign5); 76047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[6] = _mm_sub_epi16(res[6], sign6); 76147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[7] = _mm_sub_epi16(res[7], sign7); 76247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 76347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[0] = _mm_srai_epi16(res[0], bit); 76447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[1] = _mm_srai_epi16(res[1], bit); 76547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[2] = _mm_srai_epi16(res[2], bit); 76647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[3] = _mm_srai_epi16(res[3], bit); 76747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[4] = _mm_srai_epi16(res[4], bit); 76847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[5] = _mm_srai_epi16(res[5], bit); 76947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[6] = _mm_srai_epi16(res[6], bit); 77047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org res[7] = _mm_srai_epi16(res[7], bit); 77147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 77247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 77347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org// 
write 8x8 array 77447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) { 77547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 0 * stride), res[0]); 77647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 1 * stride), res[1]); 77747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 2 * stride), res[2]); 77847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 3 * stride), res[3]); 77947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 4 * stride), res[4]); 78047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 5 * stride), res[5]); 78147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 6 * stride), res[6]); 78247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(output + 7 * stride), res[7]); 78347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 78447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 785dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fdct8_sse2(__m128i *in) { 78647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // constants 78747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 78847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 78947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 79047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 79147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 79247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 79347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 79447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 79547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 79647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i u0, u1, u2, u3, u4, u5, u6, u7; 79747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i v0, v1, v2, v3, v4, v5, v6, v7; 79847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i s0, s1, s2, s3, s4, s5, s6, s7; 79947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 80047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 1 80147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s0 = _mm_add_epi16(in[0], in[7]); 80247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s1 = _mm_add_epi16(in[1], in[6]); 80347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s2 = _mm_add_epi16(in[2], in[5]); 80447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s3 = _mm_add_epi16(in[3], in[4]); 80547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s4 = _mm_sub_epi16(in[3], in[4]); 80647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s5 = _mm_sub_epi16(in[2], in[5]); 80747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s6 = _mm_sub_epi16(in[1], in[6]); 
80847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s7 = _mm_sub_epi16(in[0], in[7]); 80947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 81047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_add_epi16(s0, s3); 81147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_add_epi16(s1, s2); 81247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_sub_epi16(s1, s2); 81347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_sub_epi16(s0, s3); 81447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // interleave and perform butterfly multiplication/addition 81547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v0 = _mm_unpacklo_epi16(u0, u1); 81647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v1 = _mm_unpackhi_epi16(u0, u1); 81747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v2 = _mm_unpacklo_epi16(u2, u3); 81847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v3 = _mm_unpackhi_epi16(u2, u3); 81947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 82047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); 82147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); 82247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); 82347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); 82447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); 82547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); 82647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); 
82747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); 82847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 82947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // shift and rounding 83047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 83147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 83247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 83347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 83447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 83547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 83647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 83747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 83847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 83947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 84047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 84147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 84247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 84347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 84447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 84547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 84647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 84747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 84847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = _mm_packs_epi32(u0, u1); 84947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[2] = _mm_packs_epi32(u4, u5); 85047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[4] = _mm_packs_epi32(u2, u3); 85147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[6] = _mm_packs_epi32(u6, u7); 85247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 85347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 2 85447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // interleave and perform butterfly multiplication/addition 85547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_unpacklo_epi16(s6, s5); 85647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_unpackhi_epi16(s6, s5); 85747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); 85847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); 85947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); 86047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); 86147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 86247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // shift and rounding 86347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 86447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 
86547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 86647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 86747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 86847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 86947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 87047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 87147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 87247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 87347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_packs_epi32(v0, v1); 87447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_packs_epi32(v2, v3); 87547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 87647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 3 87747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s0 = _mm_add_epi16(s4, u0); 87847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s1 = _mm_sub_epi16(s4, u0); 87947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s2 = _mm_sub_epi16(s7, u1); 88047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s3 = _mm_add_epi16(s7, u1); 88147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 88247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 4 88347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_unpacklo_epi16(s0, s3); 88447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_unpackhi_epi16(s0, s3); 88547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_unpacklo_epi16(s1, s2); 
88647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_unpackhi_epi16(s1, s2); 88747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 88847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); 88947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); 89047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); 89147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); 89247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); 89347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); 89447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); 89547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); 89647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 89747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // shift and rounding 89847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 89947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 90047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 90147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 90247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 90347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 90447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u6 = 
_mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 90547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 90647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 90747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 90847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 90947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 91047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 91147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 91247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 91347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 91447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 91547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 91647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[1] = _mm_packs_epi32(v0, v1); 91747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[3] = _mm_packs_epi32(v4, v5); 91847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[5] = _mm_packs_epi32(v2, v3); 91947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[7] = _mm_packs_epi32(v6, v7); 92047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 92147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // transpose 92247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org array_transpose_8x8(in, in); 92347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 92447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
925dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fadst8_sse2(__m128i *in) { 92647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // Constants 92747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 92847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 92947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 93047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 93147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 93247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 93347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 93447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 93547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 93647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 93747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 93847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 93947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i 
k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 94047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__const_0 = _mm_set1_epi16(0); 94147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 94247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 94347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; 94447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 94547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; 94647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i s0, s1, s2, s3, s4, s5, s6, s7; 94747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in0, in1, in2, in3, in4, in5, in6, in7; 94847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 94947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // properly aligned for butterfly input 95047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in0 = in[7]; 95147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in1 = in[0]; 95247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in2 = in[5]; 95347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in3 = in[2]; 95447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in4 = in[3]; 95547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in5 = in[4]; 95647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in6 = in[1]; 95747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in7 = in[6]; 95847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
95947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // column transformation 96047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 1 96147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // interleave and multiply/add into 32-bit integer 96247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s0 = _mm_unpacklo_epi16(in0, in1); 96347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s1 = _mm_unpackhi_epi16(in0, in1); 96447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s2 = _mm_unpacklo_epi16(in2, in3); 96547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s3 = _mm_unpackhi_epi16(in2, in3); 96647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s4 = _mm_unpacklo_epi16(in4, in5); 96747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s5 = _mm_unpackhi_epi16(in4, in5); 96847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s6 = _mm_unpacklo_epi16(in6, in7); 96947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s7 = _mm_unpackhi_epi16(in6, in7); 97047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 97147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); 97247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); 97347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); 97447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); 97547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); 97647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); 97747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); 
97847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); 97947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); 98047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); 98147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); 98247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); 98347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); 98447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); 98547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); 98647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); 98747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 98847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // addition 98947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w0 = _mm_add_epi32(u0, u8); 99047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w1 = _mm_add_epi32(u1, u9); 99147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w2 = _mm_add_epi32(u2, u10); 99247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w3 = _mm_add_epi32(u3, u11); 99347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w4 = _mm_add_epi32(u4, u12); 99447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w5 = _mm_add_epi32(u5, u13); 99547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w6 = _mm_add_epi32(u6, u14); 99647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w7 = _mm_add_epi32(u7, u15); 
99747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w8 = _mm_sub_epi32(u0, u8); 99847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w9 = _mm_sub_epi32(u1, u9); 99947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w10 = _mm_sub_epi32(u2, u10); 100047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w11 = _mm_sub_epi32(u3, u11); 100147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w12 = _mm_sub_epi32(u4, u12); 100247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w13 = _mm_sub_epi32(u5, u13); 100347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w14 = _mm_sub_epi32(u6, u14); 100447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w15 = _mm_sub_epi32(u7, u15); 100547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 100647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // shift and rounding 100747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 100847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 100947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 101047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 101147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 101247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 101347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 101447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 101547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v8 = _mm_add_epi32(w8, 
k__DCT_CONST_ROUNDING); 101647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); 101747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); 101847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); 101947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); 102047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); 102147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); 102247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); 102347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 102447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 102547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 102647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 102747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 102847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 102947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 103047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 103147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 103247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); 103347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u9 = 
_mm_srai_epi32(v9, DCT_CONST_BITS); 103447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); 103547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); 103647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); 103747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); 103847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); 103947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); 104047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 104147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // back to 16-bit and pack 8 integers into __m128i 104247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = _mm_packs_epi32(u0, u1); 104347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[1] = _mm_packs_epi32(u2, u3); 104447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[2] = _mm_packs_epi32(u4, u5); 104547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[3] = _mm_packs_epi32(u6, u7); 104647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[4] = _mm_packs_epi32(u8, u9); 104747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[5] = _mm_packs_epi32(u10, u11); 104847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[6] = _mm_packs_epi32(u12, u13); 104947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[7] = _mm_packs_epi32(u14, u15); 105047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 105147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 2 105247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s0 = _mm_add_epi16(in[0], in[2]); 
105347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s1 = _mm_add_epi16(in[1], in[3]); 105447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s2 = _mm_sub_epi16(in[0], in[2]); 105547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s3 = _mm_sub_epi16(in[1], in[3]); 105647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_unpacklo_epi16(in[4], in[5]); 105747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_unpackhi_epi16(in[4], in[5]); 105847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_unpacklo_epi16(in[6], in[7]); 105947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_unpackhi_epi16(in[6], in[7]); 106047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 106147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); 106247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); 106347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); 106447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); 106547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); 106647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); 106747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); 106847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); 106947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 107047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w0 = _mm_add_epi32(v0, v4); 107147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w1 = _mm_add_epi32(v1, v5); 
107247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w2 = _mm_add_epi32(v2, v6); 107347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w3 = _mm_add_epi32(v3, v7); 107447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w4 = _mm_sub_epi32(v0, v4); 107547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w5 = _mm_sub_epi32(v1, v5); 107647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w6 = _mm_sub_epi32(v2, v6); 107747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org w7 = _mm_sub_epi32(v3, v7); 107847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 107947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 108047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 108147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 108247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 108347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 108447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 108547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 108647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 108747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 108847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 108947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 109047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 
109147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 109247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 109347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 109447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 109547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 109647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 109747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // back to 16-bit intergers 109847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s4 = _mm_packs_epi32(u0, u1); 109947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s5 = _mm_packs_epi32(u2, u3); 110047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s6 = _mm_packs_epi32(u4, u5); 110147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s7 = _mm_packs_epi32(u6, u7); 110247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 110347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 3 110447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_unpacklo_epi16(s2, s3); 110547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_unpackhi_epi16(s2, s3); 110647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_unpacklo_epi16(s6, s7); 110747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_unpackhi_epi16(s6, s7); 110847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 110947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); 111047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); 
111147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); 111247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); 111347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); 111447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); 111547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); 111647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); 111747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 111847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 111947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 112047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 112147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 112247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 112347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 112447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 112547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 112647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 112747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 112847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 
112947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 113047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 113147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 113247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 113347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 113447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 113547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 113647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s2 = _mm_packs_epi32(v0, v1); 113747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s3 = _mm_packs_epi32(v2, v3); 113847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s6 = _mm_packs_epi32(v4, v5); 113947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s7 = _mm_packs_epi32(v6, v7); 114047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 114147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // FIXME(jingning): do subtract using bit inversion? 
114247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = s0; 114347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[1] = _mm_sub_epi16(k__const_0, s4); 114447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[2] = s6; 114547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[3] = _mm_sub_epi16(k__const_0, s2); 114647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[4] = s3; 114747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[5] = _mm_sub_epi16(k__const_0, s7); 114847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[6] = s5; 114947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[7] = _mm_sub_epi16(k__const_0, s1); 115047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 115147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // transpose 115247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org array_transpose_8x8(in, in); 115347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 115447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 115576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_fht8x8_sse2(const int16_t *input, int16_t *output, 115676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org int stride, int tx_type) { 115747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in[8]; 115876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 115947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org switch (tx_type) { 116076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case DCT_DCT: 116176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org vp9_fdct8x8_sse2(input, output, stride); 116247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 116376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case ADST_DCT: 
116476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org load_buffer_8x8(input, in, stride); 1165dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst8_sse2(in); 1166dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fdct8_sse2(in); 116776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org right_shift_8x8(in, 1); 116876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org write_buffer_8x8(output, in, 8); 116947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 117076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case DCT_ADST: 117176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org load_buffer_8x8(input, in, stride); 1172dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fdct8_sse2(in); 1173dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst8_sse2(in); 117476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org right_shift_8x8(in, 1); 117576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org write_buffer_8x8(output, in, 8); 117647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 117776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case ADST_ADST: 117876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org load_buffer_8x8(input, in, stride); 1179dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst8_sse2(in); 1180dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst8_sse2(in); 118176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org right_shift_8x8(in, 1); 118276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org write_buffer_8x8(output, in, 8); 118347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 118447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org default: 118547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org assert(0); 
118647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 11873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 11883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org} 11893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org 119088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgvoid vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) { 119188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i in0, in1, in2, in3; 119288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i u0, u1; 119388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i sum = _mm_setzero_si128(); 119488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org int i; 119588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 119688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org for (i = 0; i < 2; ++i) { 119788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org input += 8 * i; 119888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); 119988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); 120088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); 120188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); 120288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 120388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_add_epi16(in0, in1); 120488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u1 = _mm_add_epi16(in2, in3); 120588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u0); 120688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 
120788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); 120888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); 120988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); 121088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); 121188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 121288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u1); 121388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_add_epi16(in0, in1); 121488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u1 = _mm_add_epi16(in2, in3); 121588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u0); 121688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 121788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_load_si128((const __m128i *)(input + 8 * stride)); 121888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_load_si128((const __m128i *)(input + 9 * stride)); 121988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in2 = _mm_load_si128((const __m128i *)(input + 10 * stride)); 122088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in3 = _mm_load_si128((const __m128i *)(input + 11 * stride)); 122188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 122288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u1); 122388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_add_epi16(in0, in1); 122488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u1 = _mm_add_epi16(in2, in3); 122588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, 
u0); 122688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 122788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_load_si128((const __m128i *)(input + 12 * stride)); 122888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_load_si128((const __m128i *)(input + 13 * stride)); 122988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in2 = _mm_load_si128((const __m128i *)(input + 14 * stride)); 123088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in3 = _mm_load_si128((const __m128i *)(input + 15 * stride)); 123188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 123288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u1); 123388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_add_epi16(in0, in1); 123488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u1 = _mm_add_epi16(in2, in3); 123588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u0); 123688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 123788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u1); 123888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org } 123988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 124088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_setzero_si128(); 124188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_unpacklo_epi16(u0, sum); 124288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_unpackhi_epi16(u0, sum); 124388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_srai_epi32(in0, 16); 124488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_srai_epi32(in1, 16); 124588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 124688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi32(in0, 
in1); 124788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_unpacklo_epi32(sum, u0); 124888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_unpackhi_epi32(sum, u0); 124988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 125088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi32(in0, in1); 125188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_srli_si128(sum, 8); 125288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 125388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_add_epi32(sum, in0); 125488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_srai_epi32(in1, 1); 125588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm_store_si128((__m128i *)(output), in1); 125688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org} 125788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 1258ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgvoid vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { 12593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // The 2D transform is done with two passes which are actually pretty 12603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // similar. In the first one, we transform the columns and transpose 12613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // the results. In the second one, we transform the rows. To achieve that, 1262411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // as the first pass results are transposed, we transpose the columns (that 12633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // is the transposed rows) and transpose the results (so that it goes back 12643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // in normal/row positions). 
12653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org int pass; 12663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // We need an intermediate buffer between passes. 126747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); 1268ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org const int16_t *in = input; 12693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org int16_t *out = intermediate; 12703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Constants 12713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // When we use them, in one case, they are all the same. In all others 12723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // it's a pair of them that we need to repeat four times. This is done 12733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // by constructing the 32 bit constant corresponding to that pair. 
12743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 12753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 12763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 127788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); 12783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 12793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 12803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 12813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 12823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 12833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); 12843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); 12853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 12863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 12873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, 
cospi_10_64); 12883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); 12893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 12903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 12913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 12923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i kOne = _mm_set1_epi16(1); 12933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Do the two transform/transpose passes 12943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org for (pass = 0; pass < 2; ++pass) { 12953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // We process eight columns (transposed rows in second pass) at a time. 
12963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org int column_start; 12973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org for (column_start = 0; column_start < 16; column_start += 8) { 12983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i in00, in01, in02, in03, in04, in05, in06, in07; 12993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i in08, in09, in10, in11, in12, in13, in14, in15; 13003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i input0, input1, input2, input3, input4, input5, input6, input7; 13013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i step1_0, step1_1, step1_2, step1_3; 13023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i step1_4, step1_5, step1_6, step1_7; 13033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; 13043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i step3_0, step3_1, step3_2, step3_3; 13053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i step3_4, step3_5, step3_6, step3_7; 13063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i res00, res01, res02, res03, res04, res05, res06, res07; 13073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org __m128i res08, res09, res10, res11, res12, res13, res14, res15; 13083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Load and pre-condition input. 
13093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org if (0 == pass) { 131047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); 131147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); 131247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); 131347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); 131447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); 131547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); 131647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); 131747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); 131847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); 131947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); 132047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); 132147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); 132247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); 132347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); 132447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in14 = _mm_load_si128((const 
__m128i *)(in + 14 * stride)); 132547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); 13263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // x = x << 2 13273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in00 = _mm_slli_epi16(in00, 2); 13283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in01 = _mm_slli_epi16(in01, 2); 13293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in02 = _mm_slli_epi16(in02, 2); 13303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in03 = _mm_slli_epi16(in03, 2); 13313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in04 = _mm_slli_epi16(in04, 2); 13323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in05 = _mm_slli_epi16(in05, 2); 13333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in06 = _mm_slli_epi16(in06, 2); 13343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in07 = _mm_slli_epi16(in07, 2); 13353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in08 = _mm_slli_epi16(in08, 2); 13363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in09 = _mm_slli_epi16(in09, 2); 13373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in10 = _mm_slli_epi16(in10, 2); 13383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in11 = _mm_slli_epi16(in11, 2); 13393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in12 = _mm_slli_epi16(in12, 2); 13403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in13 = _mm_slli_epi16(in13, 2); 13413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in14 = _mm_slli_epi16(in14, 2); 13423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in15 = _mm_slli_epi16(in15, 2); 13433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } else { 
134447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); 134547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); 134647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); 134747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); 134847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); 134947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); 135047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); 135147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); 135247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); 135347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); 135447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); 135547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); 135647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); 135747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); 135847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); 135947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in15 = _mm_load_si128((const __m128i *)(in + 15 * 
16)); 13603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // x = (x + 1) >> 2 13613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in00 = _mm_add_epi16(in00, kOne); 13623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in01 = _mm_add_epi16(in01, kOne); 13633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in02 = _mm_add_epi16(in02, kOne); 13643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in03 = _mm_add_epi16(in03, kOne); 13653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in04 = _mm_add_epi16(in04, kOne); 13663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in05 = _mm_add_epi16(in05, kOne); 13673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in06 = _mm_add_epi16(in06, kOne); 13683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in07 = _mm_add_epi16(in07, kOne); 13693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in08 = _mm_add_epi16(in08, kOne); 13703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in09 = _mm_add_epi16(in09, kOne); 13713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in10 = _mm_add_epi16(in10, kOne); 13723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in11 = _mm_add_epi16(in11, kOne); 13733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in12 = _mm_add_epi16(in12, kOne); 13743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in13 = _mm_add_epi16(in13, kOne); 13753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in14 = _mm_add_epi16(in14, kOne); 13763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in15 = _mm_add_epi16(in15, kOne); 13773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in00 = _mm_srai_epi16(in00, 2); 13783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in01 = _mm_srai_epi16(in01, 2); 
13793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in02 = _mm_srai_epi16(in02, 2); 13803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in03 = _mm_srai_epi16(in03, 2); 13813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in04 = _mm_srai_epi16(in04, 2); 13823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in05 = _mm_srai_epi16(in05, 2); 13833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in06 = _mm_srai_epi16(in06, 2); 13843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in07 = _mm_srai_epi16(in07, 2); 13853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in08 = _mm_srai_epi16(in08, 2); 13863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in09 = _mm_srai_epi16(in09, 2); 13873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in10 = _mm_srai_epi16(in10, 2); 13883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in11 = _mm_srai_epi16(in11, 2); 13893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in12 = _mm_srai_epi16(in12, 2); 13903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in13 = _mm_srai_epi16(in13, 2); 13913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in14 = _mm_srai_epi16(in14, 2); 13923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in15 = _mm_srai_epi16(in15, 2); 13933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 13943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in += 8; 13953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Calculate input for the first 8 results. 
13963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 13973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org input0 = _mm_add_epi16(in00, in15); 13983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org input1 = _mm_add_epi16(in01, in14); 13993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org input2 = _mm_add_epi16(in02, in13); 14003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org input3 = _mm_add_epi16(in03, in12); 14013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org input4 = _mm_add_epi16(in04, in11); 14023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org input5 = _mm_add_epi16(in05, in10); 14033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org input6 = _mm_add_epi16(in06, in09); 14043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org input7 = _mm_add_epi16(in07, in08); 14053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 14063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Calculate input for the next 8 results. 
14073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 14083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_0 = _mm_sub_epi16(in07, in08); 14093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_1 = _mm_sub_epi16(in06, in09); 14103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_2 = _mm_sub_epi16(in05, in10); 14113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_3 = _mm_sub_epi16(in04, in11); 14123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_4 = _mm_sub_epi16(in03, in12); 14133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_5 = _mm_sub_epi16(in02, in13); 14143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_6 = _mm_sub_epi16(in01, in14); 14153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_7 = _mm_sub_epi16(in00, in15); 14163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 1417dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org // Work on the first eight values; fdct8(input, even_results); 14183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 1419411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // Add/subtract 14203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q0 = _mm_add_epi16(input0, input7); 14213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q1 = _mm_add_epi16(input1, input6); 14223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q2 = _mm_add_epi16(input2, input5); 14233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q3 = _mm_add_epi16(input3, input4); 14243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q4 = _mm_sub_epi16(input3, input4); 14253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q5 = _mm_sub_epi16(input2, 
input5); 14263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q6 = _mm_sub_epi16(input1, input6); 14273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i q7 = _mm_sub_epi16(input0, input7); 14283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Work on first four results 14293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 1430411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // Add/subtract 14313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r0 = _mm_add_epi16(q0, q3); 14323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r1 = _mm_add_epi16(q1, q2); 14333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r2 = _mm_sub_epi16(q1, q2); 14343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r3 = _mm_sub_epi16(q0, q3); 14353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Interleave to do the multiply by constants which gets us 14363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // into 32 bits. 
14373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 14383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t1 = _mm_unpackhi_epi16(r0, r1); 14393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 14403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(r2, r3); 14413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 14423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 14433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 14443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); 14453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 14463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); 14473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 14483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); 14493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 14503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 14513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 14523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 
14533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 14543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 14553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 14563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 14573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 14583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 14593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 14603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 14613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 14623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 14633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 14643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 14653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 14663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 14673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res00 = _mm_packs_epi32(w0, w1); 14683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res08 = _mm_packs_epi32(w2, w3); 
14693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res04 = _mm_packs_epi32(w4, w5); 14703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res12 = _mm_packs_epi32(w6, w7); 14713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 14723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Work on next four results 14733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 14743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Interleave to do the multiply by constants which gets us 14753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // into 32 bits. 14763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i d0 = _mm_unpacklo_epi16(q6, q5); 14773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i d1 = _mm_unpackhi_epi16(q6, q5); 14783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); 14793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); 14803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); 14813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); 14823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 14833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); 14843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); 14853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); 14863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i f3 = 
_mm_add_epi32(e3, k__DCT_CONST_ROUNDING); 14873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); 14883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); 14893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); 14903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); 14913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 14923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r0 = _mm_packs_epi32(s0, s1); 14933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i r1 = _mm_packs_epi32(s2, s3); 1494411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // Add/subtract 14953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i x0 = _mm_add_epi16(q4, r0); 14963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i x1 = _mm_sub_epi16(q4, r0); 14973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i x2 = _mm_sub_epi16(q7, r1); 14983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i x3 = _mm_add_epi16(q7, r1); 14993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Interleave to do the multiply by constants which gets us 15003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // into 32 bits. 
15013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(x0, x3); 15023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t1 = _mm_unpackhi_epi16(x0, x3); 15033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = _mm_unpacklo_epi16(x1, x2); 15043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(x1, x2); 15053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); 15063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); 15073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); 15083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); 15093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); 15103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); 15113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); 15123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); 15133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 15143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 15153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 15163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 
15173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 15183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 15193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 15203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 15213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 15223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 15233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 15243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 15253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 15263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 15273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 15283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 15293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 15303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 15313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res02 = _mm_packs_epi32(w0, w1); 15323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res14 = _mm_packs_epi32(w2, w3); 
15333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res10 = _mm_packs_epi32(w4, w5); 15343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res06 = _mm_packs_epi32(w6, w7); 15353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 15363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 15373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Work on the next eight values; step1 -> odd_results 15383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 15393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // step 2 15403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 15413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); 15423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); 15433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); 15443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); 15453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); 15463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); 15473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); 15483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); 15493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 15503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 
15513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 15523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 15533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 15543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 15553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 15563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 15573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 15583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 15593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step2_2 = _mm_packs_epi32(w0, w1); 15603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step2_3 = _mm_packs_epi32(w2, w3); 15613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 15623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 15633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); 15643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); 15653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); 15663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); 15673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 
15683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 15693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); 15703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); 15713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 15723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 15733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 15743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 15753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 15763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 15773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 15783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 15793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 15803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 15813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step2_5 = _mm_packs_epi32(w0, w1); 15823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step2_4 = _mm_packs_epi32(w2, w3); 15833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 15843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // step 3 
15853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 15863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step3_0 = _mm_add_epi16(step1_0, step2_3); 15873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step3_1 = _mm_add_epi16(step1_1, step2_2); 15883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step3_2 = _mm_sub_epi16(step1_1, step2_2); 15893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step3_3 = _mm_sub_epi16(step1_0, step2_3); 15903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step3_4 = _mm_sub_epi16(step1_7, step2_4); 15913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step3_5 = _mm_sub_epi16(step1_6, step2_5); 15923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step3_6 = _mm_add_epi16(step1_6, step2_5); 15933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step3_7 = _mm_add_epi16(step1_7, step2_4); 15943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 15953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // step 4 15963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 15973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); 15983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); 15993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); 16003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); 16013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); 16023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); 
160388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08); 160488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08); 16053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 16063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 16073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 16083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 16093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 16103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 16113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 16123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 16133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 16143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 16153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step2_1 = _mm_packs_epi32(w0, w1); 16163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step2_2 = _mm_packs_epi32(w2, w3); 16173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 16183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 16193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); 16203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org 
const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); 16213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); 16223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); 16233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); 16243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); 162588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24); 162688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24); 16273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 16283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 16293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 16303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 16313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 16323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 16333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 16343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 16353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 
16363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 16373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step2_6 = _mm_packs_epi32(w0, w1); 16383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step2_5 = _mm_packs_epi32(w2, w3); 16393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 16403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // step 5 16413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 16423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_0 = _mm_add_epi16(step3_0, step2_1); 16433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_1 = _mm_sub_epi16(step3_0, step2_1); 164488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org step1_2 = _mm_add_epi16(step3_3, step2_2); 164588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org step1_3 = _mm_sub_epi16(step3_3, step2_2); 164688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org step1_4 = _mm_sub_epi16(step3_4, step2_5); 164788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org step1_5 = _mm_add_epi16(step3_4, step2_5); 16483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_6 = _mm_sub_epi16(step3_7, step2_6); 16493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org step1_7 = _mm_add_epi16(step3_7, step2_6); 16503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 16513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // step 6 16523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 16533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); 16543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); 16553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = 
_mm_unpacklo_epi16(step1_1, step1_6); 16563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); 16573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); 16583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); 16593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); 16603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); 16613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 16623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 16633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 16643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 16653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 16663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 16673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 16683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 16693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 16703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 16713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res01 = _mm_packs_epi32(w0, w1); 
16723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res09 = _mm_packs_epi32(w2, w3); 16733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 16743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 16753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); 16763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); 16773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); 16783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); 16793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); 16803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); 16813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); 16823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); 16833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 16843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 16853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 16863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 16873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 16883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, 
DCT_CONST_BITS); 16893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 16903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 16913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 16923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 16933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res05 = _mm_packs_epi32(w0, w1); 16943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res13 = _mm_packs_epi32(w2, w3); 16953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 16963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 16973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); 16983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); 16993f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); 17003f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); 17013f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); 17023f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); 17033f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); 17043f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); 17053f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 
17063f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 17073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 17083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 17093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 17103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 17113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 17123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 17133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 17143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 17153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res11 = _mm_packs_epi32(w0, w1); 17163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res03 = _mm_packs_epi32(w2, w3); 17173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 17183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 17193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); 17203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); 17213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); 17223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); 
17233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); 17243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); 17253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); 17263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); 17273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // dct_const_round_shift 17283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 17293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 17303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 17313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 17323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 17333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 17343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 17353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 17363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Combine 17373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res15 = _mm_packs_epi32(w0, w1); 17383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org res07 = _mm_packs_epi32(w2, w3); 17393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 
17403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 17413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Transpose the results, do it as two 8x8 transposes. 17423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 17433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 01 02 03 04 05 06 07 17443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 10 11 12 13 14 15 16 17 17453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 20 21 22 23 24 25 26 27 17463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 30 31 32 33 34 35 36 37 17473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 40 41 42 43 44 45 46 47 17483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 50 51 52 53 54 55 56 57 17493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 60 61 62 63 64 65 66 67 17503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 70 71 72 73 74 75 76 77 17513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); 17523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); 17533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); 17543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); 17553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); 17563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); 17573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); 17583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org 
const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); 17593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 10 01 11 02 12 03 13 17603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 20 30 21 31 22 32 23 33 17613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 04 14 05 15 06 16 07 17 17623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 24 34 25 35 26 36 27 37 17633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 40 50 41 51 42 52 43 53 17643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 60 70 61 71 62 72 63 73 17653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 54 54 55 55 56 56 57 57 17663f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 64 74 65 75 66 76 67 77 17673f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 17683f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 17693f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 17703f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 17713f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 17723f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 17733f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 17743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 17753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 10 20 30 01 11 21 31 
17763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 40 50 60 70 41 51 61 71 17773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 02 12 22 32 03 13 23 33 17783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 42 52 62 72 43 53 63 73 17793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 04 14 24 34 05 15 21 36 17803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 44 54 64 74 45 55 61 76 17813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 06 16 26 36 07 17 27 37 17823f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 46 56 66 76 47 57 67 77 17833f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 17843f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 17853f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 17863f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 17873f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 17883f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 17893f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 17903f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 17913f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 10 20 30 40 50 60 70 17923f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 01 11 21 31 41 51 61 71 17933f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 02 12 22 32 42 52 62 72 
17943f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 03 13 23 33 43 53 63 73 17953f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 04 14 24 34 44 54 64 74 17963f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 05 15 25 35 45 55 65 75 17973f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 06 16 26 36 46 56 66 76 17983f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 07 17 27 37 47 57 67 77 179910a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0); 180010a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1); 180110a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2); 180210a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3); 180310a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4); 180410a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5); 180510a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6); 180610a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7); 18073f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 18083f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org { 18093f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 01 02 03 04 05 06 07 18103f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 10 11 12 13 14 15 16 17 18113f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 20 21 22 23 24 25 26 27 18123f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 30 31 32 33 34 35 36 37 
18133f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 40 41 42 43 44 45 46 47 18143f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 50 51 52 53 54 55 56 57 18153f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 60 61 62 63 64 65 66 67 18163f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 70 71 72 73 74 75 76 77 18173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); 18183f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); 18193f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); 18203f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); 18213f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13); 18223f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); 18233f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); 18243f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); 18253f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 10 01 11 02 12 03 13 18263f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 20 30 21 31 22 32 23 33 18273f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 04 14 05 15 06 16 07 17 18283f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 24 34 25 35 26 36 27 37 18293f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 40 50 41 51 42 52 43 53 18303f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 60 70 61 71 62 72 63 73 
18313f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 54 54 55 55 56 56 57 57 18323f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 64 74 65 75 66 76 67 77 18333f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 18343f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 18353f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 18363f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 18373f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 18383f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 18393f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 18403f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 18413f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 10 20 30 01 11 21 31 18423f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 40 50 60 70 41 51 61 71 18433f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 02 12 22 32 03 13 23 33 18443f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 42 52 62 72 43 53 63 73 18453f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 04 14 24 34 05 15 21 36 18463f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 44 54 64 74 45 55 61 76 18473f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 06 16 26 36 07 17 27 37 18483f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 46 56 66 76 47 57 67 77 
18493f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 18503f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 18513f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 18523f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 18533f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 18543f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 18553f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 18563f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 18573f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 00 10 20 30 40 50 60 70 18583f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 01 11 21 31 41 51 61 71 18593f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 02 12 22 32 42 52 62 72 18603f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 03 13 23 33 43 53 63 73 18613f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 04 14 24 34 44 54 64 74 18623f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 05 15 25 35 45 55 65 75 18633f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 06 16 26 36 46 56 66 76 18643f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // 07 17 27 37 47 57 67 77 18653f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Store results 186647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); 
186747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); 186847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); 186947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); 187047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); 187147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); 187247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); 187347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); 18743f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 18753f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org out += 8*16; 18763f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 18773f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org // Setup in/out for next pass. 
18783f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org in = intermediate; 18793f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org out = output; 18803f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org } 18813f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org} 188247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org /* Load a 16x16 block through four load_buffer_8x8 calls: the two tiles read from input and input + 8 * stride go to in0 and in0 + 8, and the two tiles read 8 elements further along each row go to in1 and in1 + 8. stride is forwarded unchanged to load_buffer_8x8. NOTE(review): assumes load_buffer_8x8 fills 8 __m128i entries from 8 rows - confirm against its definition. */ 1883ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgstatic INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0, 188447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i *in1, int stride) { 188547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // load first 8 columns: tile at input, then the tile 8 * stride further on 188647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org load_buffer_8x8(input, in0, stride); 188747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org load_buffer_8x8(input + 8 * stride, in0 + 8, stride); 188847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 188947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org input += 8; 189047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // load second 8 columns: same two tiles after advancing input by 8 189147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org load_buffer_8x8(input, in1, stride); 189247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org load_buffer_8x8(input + 8 * stride, in1 + 8, stride); 189347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 189447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org /* Store a 16x16 block through four write_buffer_8x8 calls, in the same order load_buffer_16x16 reads it: in0 and in0 + 8 go to output and output + 8 * stride, then in1 and in1 + 8 go to the same two positions after output is advanced by 8. */ 189547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void write_buffer_16x16(int16_t *output, __m128i *in0, 189647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i *in1, int stride) { 189747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // write first 8 columns: tile at output, then the tile 8 * stride further on 189847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org write_buffer_8x8(output, 
in0, stride); 189947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org write_buffer_8x8(output + 8 * stride, in0 + 8, stride); 190047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // write second 8 columns: same two tiles after advancing output by 8 190147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org output += 8; 190247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org write_buffer_8x8(output, in1, stride); 190347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org write_buffer_8x8(output + 8 * stride, in1 + 8, stride); 190447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 190547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org /* Apply right_shift_8x8 with a shift argument of 2 to each of res0, res0 + 8, res1 and res1 + 8, modifying the buffers in place through the passed pointers. */ 190647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { 190747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // perform rounding operations (NOTE(review): rounding behaviour lives in right_shift_8x8 - confirm there) 190847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org right_shift_8x8(res0, 2); 190947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org right_shift_8x8(res0 + 8, 2); 191047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org right_shift_8x8(res1, 2); 191147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org right_shift_8x8(res1 + 8, 2); 191247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 191347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 1914dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fdct16_8col(__m128i *in) { 191547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // perform 16x16 1-D DCT for 8 columns 191647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i i[8], s[8], p[8], t[8], u[16], v[16]; 191747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 191847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 191947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 192047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 192188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); 192247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 192347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 192447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 192547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 192647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 192747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); 192847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); 192947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 193047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 193147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); 
193247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); 193347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 193447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 193547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 193647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 193747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 1 193847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org i[0] = _mm_add_epi16(in[0], in[15]); 193947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org i[1] = _mm_add_epi16(in[1], in[14]); 194047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org i[2] = _mm_add_epi16(in[2], in[13]); 194147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org i[3] = _mm_add_epi16(in[3], in[12]); 194247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org i[4] = _mm_add_epi16(in[4], in[11]); 194347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org i[5] = _mm_add_epi16(in[5], in[10]); 194447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org i[6] = _mm_add_epi16(in[6], in[9]); 194547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org i[7] = _mm_add_epi16(in[7], in[8]); 194647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 194747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[0] = _mm_sub_epi16(in[7], in[8]); 194847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[1] = _mm_sub_epi16(in[6], in[9]); 194947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[2] = _mm_sub_epi16(in[5], in[10]); 
195047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[3] = _mm_sub_epi16(in[4], in[11]); 195147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[4] = _mm_sub_epi16(in[3], in[12]); 195247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[5] = _mm_sub_epi16(in[2], in[13]); 195347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[6] = _mm_sub_epi16(in[1], in[14]); 195447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[7] = _mm_sub_epi16(in[0], in[15]); 195547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 195647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[0] = _mm_add_epi16(i[0], i[7]); 195747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[1] = _mm_add_epi16(i[1], i[6]); 195847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[2] = _mm_add_epi16(i[2], i[5]); 195947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[3] = _mm_add_epi16(i[3], i[4]); 196047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[4] = _mm_sub_epi16(i[3], i[4]); 196147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[5] = _mm_sub_epi16(i[2], i[5]); 196247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[6] = _mm_sub_epi16(i[1], i[6]); 196347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[7] = _mm_sub_epi16(i[0], i[7]); 196447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 196547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi16(p[0], p[3]); 196647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_add_epi16(p[1], p[2]); 196747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_sub_epi16(p[1], p[2]); 196847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_sub_epi16(p[0], p[3]); 196947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
197047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_unpacklo_epi16(u[0], u[1]); 197147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_unpackhi_epi16(u[0], u[1]); 197247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_unpacklo_epi16(u[2], u[3]); 197347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_unpackhi_epi16(u[2], u[3]); 197447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 197547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); 197647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); 197747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); 197847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); 197947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); 198047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); 198147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); 198247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); 198347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 198447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 198547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 198647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 198747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 
198847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 198947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 199047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 199147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 199247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 199347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 199447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 199547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 199647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 199747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 199847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 199947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 200047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 200147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 200247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = _mm_packs_epi32(u[0], u[1]); 200347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[4] = _mm_packs_epi32(u[4], u[5]); 200447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[8] = _mm_packs_epi32(u[2], u[3]); 200547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[12] = _mm_packs_epi32(u[6], u[7]); 
200647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 200747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_unpacklo_epi16(p[5], p[6]); 200847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_unpackhi_epi16(p[5], p[6]); 200947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 201047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 201147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 201247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 201347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 201447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 201547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 201647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 201747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 201847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 201947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 202047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 202147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 202247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 202347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 202447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = 
_mm_packs_epi32(v[0], v[1]); 202547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_packs_epi32(v[2], v[3]); 202647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 202747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[0] = _mm_add_epi16(p[4], u[0]); 202847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[1] = _mm_sub_epi16(p[4], u[0]); 202947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[2] = _mm_sub_epi16(p[7], u[1]); 203047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[3] = _mm_add_epi16(p[7], u[1]); 203147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 203247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_unpacklo_epi16(t[0], t[3]); 203347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_unpackhi_epi16(t[0], t[3]); 203447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_unpacklo_epi16(t[1], t[2]); 203547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_unpackhi_epi16(t[1], t[2]); 203647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 203747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); 203847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); 203947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); 204047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); 204147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); 204247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); 204347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_madd_epi16(u[0], 
k__cospi_m04_p28); 204447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); 204547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 204647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 204747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 204847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 204947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 205047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 205147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 205247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 205347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 205447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 205547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 205647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 205747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 205847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 205947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 206047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 
206147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 206247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 206347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 206447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[2] = _mm_packs_epi32(v[0], v[1]); 206547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[6] = _mm_packs_epi32(v[4], v[5]); 206647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[10] = _mm_packs_epi32(v[2], v[3]); 206747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[14] = _mm_packs_epi32(v[6], v[7]); 206847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 206947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 2 207047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_unpacklo_epi16(s[2], s[5]); 207147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_unpackhi_epi16(s[2], s[5]); 207247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_unpacklo_epi16(s[3], s[4]); 207347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_unpackhi_epi16(s[3], s[4]); 207447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 207547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 207647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 207747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 207847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 207947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 
208047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 208147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 208247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 208347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 208447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 208547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 208647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 208747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 208847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 208947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 209047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 209147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 209247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 209347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 209447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 209547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 209647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 209747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = 
_mm_srai_epi32(u[4], DCT_CONST_BITS); 209847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 209947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 210047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 210147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 210247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[2] = _mm_packs_epi32(v[0], v[1]); 210347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[3] = _mm_packs_epi32(v[2], v[3]); 210447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[4] = _mm_packs_epi32(v[4], v[5]); 210547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[5] = _mm_packs_epi32(v[6], v[7]); 210647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 210747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 3 210847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[0] = _mm_add_epi16(s[0], t[3]); 210947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[1] = _mm_add_epi16(s[1], t[2]); 211047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[2] = _mm_sub_epi16(s[1], t[2]); 211147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[3] = _mm_sub_epi16(s[0], t[3]); 211247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[4] = _mm_sub_epi16(s[7], t[4]); 211347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[5] = _mm_sub_epi16(s[6], t[5]); 211447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[6] = _mm_add_epi16(s[6], t[5]); 211547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p[7] = _mm_add_epi16(s[7], t[4]); 211647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 211747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
// stage 4 211847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_unpacklo_epi16(p[1], p[6]); 211947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_unpackhi_epi16(p[1], p[6]); 212047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_unpacklo_epi16(p[2], p[5]); 212147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_unpackhi_epi16(p[2], p[5]); 212247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 212347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); 212447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); 212588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08); 212688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08); 212788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24); 212888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24); 212947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); 213047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); 213147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 213247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 213347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 213447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 213547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 
213647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 213747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 213847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 213947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 214047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 214147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 214247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 214347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 214447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 214547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 214647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 214747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 214847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 214947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 215047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[1] = _mm_packs_epi32(v[0], v[1]); 215147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[2] = _mm_packs_epi32(v[2], v[3]); 215247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[5] = _mm_packs_epi32(v[4], v[5]); 215347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org t[6] = _mm_packs_epi32(v[6], v[7]); 
215447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 215547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 5 215647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[0] = _mm_add_epi16(p[0], t[1]); 215747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[1] = _mm_sub_epi16(p[0], t[1]); 215888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org s[2] = _mm_add_epi16(p[3], t[2]); 215988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org s[3] = _mm_sub_epi16(p[3], t[2]); 216088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org s[4] = _mm_sub_epi16(p[4], t[5]); 216188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org s[5] = _mm_add_epi16(p[4], t[5]); 216247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[6] = _mm_sub_epi16(p[7], t[6]); 216347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[7] = _mm_add_epi16(p[7], t[6]); 216447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 216547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 6 216647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_unpacklo_epi16(s[0], s[7]); 216747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_unpackhi_epi16(s[0], s[7]); 216847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_unpacklo_epi16(s[1], s[6]); 216947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_unpackhi_epi16(s[1], s[6]); 217047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_unpacklo_epi16(s[2], s[5]); 217147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_unpackhi_epi16(s[2], s[5]); 217247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_unpacklo_epi16(s[3], s[4]); 217347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_unpackhi_epi16(s[3], s[4]); 
217447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 217547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); 217647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); 217747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); 217847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); 217947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); 218047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); 218147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); 218247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); 218347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); 218447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); 218547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); 218647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); 218747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); 218847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); 218947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); 219047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); 
219147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 219247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 219347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 219447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 219547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 219647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 219747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 219847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 219947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 220047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 220147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 220247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 220347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 220447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 220547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 220647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 220747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[15] = _mm_add_epi32(v[15], 
k__DCT_CONST_ROUNDING); 220847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 220947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 221047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 221147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 221247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 221347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 221447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 221547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 221647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 221747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 221847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 221947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 222047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 222147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 222247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 222347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 222447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 
222547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 222647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[1] = _mm_packs_epi32(v[0], v[1]); 222747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[9] = _mm_packs_epi32(v[2], v[3]); 222847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[5] = _mm_packs_epi32(v[4], v[5]); 222947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[13] = _mm_packs_epi32(v[6], v[7]); 223047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[3] = _mm_packs_epi32(v[8], v[9]); 223147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[11] = _mm_packs_epi32(v[10], v[11]); 223247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[7] = _mm_packs_epi32(v[12], v[13]); 223347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[15] = _mm_packs_epi32(v[14], v[15]); 223447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 223547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 2236dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fadst16_8col(__m128i *in) { 223747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // perform 16x16 1-D ADST for 8 columns 223847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i s[16], x[16], u[32], v[32]; 223947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 224047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 224147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 224247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 
224347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); 224447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); 224547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); 224647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); 224747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); 224847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); 224947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); 225047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); 225147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); 225247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); 225347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); 225447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); 225547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 225647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p28_m04 = 
pair_set_epi16(cospi_28_64, -cospi_4_64); 225747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 225847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 225947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); 226047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); 226147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 226247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 226347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 226447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); 226547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 226647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 226747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 226847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 226947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i kZero = _mm_set1_epi16(0); 227047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
227147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_unpacklo_epi16(in[15], in[0]); 227247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_unpackhi_epi16(in[15], in[0]); 227347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_unpacklo_epi16(in[13], in[2]); 227447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_unpackhi_epi16(in[13], in[2]); 227547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_unpacklo_epi16(in[11], in[4]); 227647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_unpackhi_epi16(in[11], in[4]); 227747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_unpacklo_epi16(in[9], in[6]); 227847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_unpackhi_epi16(in[9], in[6]); 227947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[8] = _mm_unpacklo_epi16(in[7], in[8]); 228047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[9] = _mm_unpackhi_epi16(in[7], in[8]); 228147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[10] = _mm_unpacklo_epi16(in[5], in[10]); 228247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[11] = _mm_unpackhi_epi16(in[5], in[10]); 228347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[12] = _mm_unpacklo_epi16(in[3], in[12]); 228447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[13] = _mm_unpackhi_epi16(in[3], in[12]); 228547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[14] = _mm_unpacklo_epi16(in[1], in[14]); 228647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[15] = _mm_unpackhi_epi16(in[1], in[14]); 228747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 228847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); 
228947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); 229047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); 229147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); 229247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); 229347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); 229447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); 229547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); 229647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); 229747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); 229847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); 229947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); 230047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); 230147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); 230247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); 230347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); 230447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); 230547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); 
230647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); 230747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); 230847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); 230947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); 231047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); 231147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); 231247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); 231347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); 231447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); 231547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); 231647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); 231747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); 231847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); 231947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); 232047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 232147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi32(v[0], v[16]); 232247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_add_epi32(v[1], v[17]); 
232347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_add_epi32(v[2], v[18]); 232447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_add_epi32(v[3], v[19]); 232547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_add_epi32(v[4], v[20]); 232647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_add_epi32(v[5], v[21]); 232747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_add_epi32(v[6], v[22]); 232847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_add_epi32(v[7], v[23]); 232947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[8] = _mm_add_epi32(v[8], v[24]); 233047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[9] = _mm_add_epi32(v[9], v[25]); 233147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[10] = _mm_add_epi32(v[10], v[26]); 233247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[11] = _mm_add_epi32(v[11], v[27]); 233347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[12] = _mm_add_epi32(v[12], v[28]); 233447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[13] = _mm_add_epi32(v[13], v[29]); 233547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[14] = _mm_add_epi32(v[14], v[30]); 233647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[15] = _mm_add_epi32(v[15], v[31]); 233747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[16] = _mm_sub_epi32(v[0], v[16]); 233847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[17] = _mm_sub_epi32(v[1], v[17]); 233947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[18] = _mm_sub_epi32(v[2], v[18]); 234047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[19] = _mm_sub_epi32(v[3], v[19]); 234147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[20] = 
_mm_sub_epi32(v[4], v[20]); 234247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[21] = _mm_sub_epi32(v[5], v[21]); 234347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[22] = _mm_sub_epi32(v[6], v[22]); 234447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[23] = _mm_sub_epi32(v[7], v[23]); 234547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[24] = _mm_sub_epi32(v[8], v[24]); 234647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[25] = _mm_sub_epi32(v[9], v[25]); 234747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[26] = _mm_sub_epi32(v[10], v[26]); 234847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[27] = _mm_sub_epi32(v[11], v[27]); 234947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[28] = _mm_sub_epi32(v[12], v[28]); 235047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[29] = _mm_sub_epi32(v[13], v[29]); 235147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[30] = _mm_sub_epi32(v[14], v[30]); 235247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[31] = _mm_sub_epi32(v[15], v[31]); 235347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 235447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 235547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 235647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 235747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 235847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 235947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 
236047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 236147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 236247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 236347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 236447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 236547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 236647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 236747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 236847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 236947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 237047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 237147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 237247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); 237347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 237447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 237547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 
237647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 237747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 237847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 237947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 238047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 238147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 238247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 238347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 238447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 238547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); 238647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 238747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 238847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 238947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 239047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 239147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 239247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 
239347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 239447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 239547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 239647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 239747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 239847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 239947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 240047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 240147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 240247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 240347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); 240447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); 240547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); 240647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); 240747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); 240847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); 240947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); 
241047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); 241147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); 241247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); 241347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); 241447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); 241547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); 241647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); 241747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); 241847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); 241947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 242047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[0] = _mm_packs_epi32(u[0], u[1]); 242147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[1] = _mm_packs_epi32(u[2], u[3]); 242247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[2] = _mm_packs_epi32(u[4], u[5]); 242347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[3] = _mm_packs_epi32(u[6], u[7]); 242447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[4] = _mm_packs_epi32(u[8], u[9]); 242547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[5] = _mm_packs_epi32(u[10], u[11]); 242647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[6] = _mm_packs_epi32(u[12], u[13]); 242747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[7] = _mm_packs_epi32(u[14], u[15]); 
242847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[8] = _mm_packs_epi32(u[16], u[17]); 242947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[9] = _mm_packs_epi32(u[18], u[19]); 243047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[10] = _mm_packs_epi32(u[20], u[21]); 243147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[11] = _mm_packs_epi32(u[22], u[23]); 243247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[12] = _mm_packs_epi32(u[24], u[25]); 243347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[13] = _mm_packs_epi32(u[26], u[27]); 243447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[14] = _mm_packs_epi32(u[28], u[29]); 243547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[15] = _mm_packs_epi32(u[30], u[31]); 243647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 243747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 2 243847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_unpacklo_epi16(s[8], s[9]); 243947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_unpackhi_epi16(s[8], s[9]); 244047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_unpacklo_epi16(s[10], s[11]); 244147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_unpackhi_epi16(s[10], s[11]); 244247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_unpacklo_epi16(s[12], s[13]); 244347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_unpackhi_epi16(s[12], s[13]); 244447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_unpacklo_epi16(s[14], s[15]); 244547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_unpackhi_epi16(s[14], s[15]); 244647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
244747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 244847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 244947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 245047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 245147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 245247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 245347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 245447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 245547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); 245647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); 245747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); 245847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); 245947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); 246047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); 246147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); 246247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); 246347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
246447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi32(v[0], v[8]); 246547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_add_epi32(v[1], v[9]); 246647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_add_epi32(v[2], v[10]); 246747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_add_epi32(v[3], v[11]); 246847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_add_epi32(v[4], v[12]); 246947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_add_epi32(v[5], v[13]); 247047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_add_epi32(v[6], v[14]); 247147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_add_epi32(v[7], v[15]); 247247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[8] = _mm_sub_epi32(v[0], v[8]); 247347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[9] = _mm_sub_epi32(v[1], v[9]); 247447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[10] = _mm_sub_epi32(v[2], v[10]); 247547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[11] = _mm_sub_epi32(v[3], v[11]); 247647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[12] = _mm_sub_epi32(v[4], v[12]); 247747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[13] = _mm_sub_epi32(v[5], v[13]); 247847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[14] = _mm_sub_epi32(v[6], v[14]); 247947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[15] = _mm_sub_epi32(v[7], v[15]); 248047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 248147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 248247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 
248347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 248447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 248547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 248647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 248747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 248847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 248947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 249047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 249147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 249247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 249347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 249447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 249547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 249647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 249747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 249847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 249947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_srai_epi32(v[1], 
DCT_CONST_BITS); 250047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 250147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 250247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 250347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 250447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 250547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 250647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 250747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 250847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 250947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 251047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 251147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 251247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 251347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 251447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 251547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[0] = _mm_add_epi16(s[0], s[4]); 251647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[1] = _mm_add_epi16(s[1], s[5]); 251747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[2] = 
_mm_add_epi16(s[2], s[6]); 251847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[3] = _mm_add_epi16(s[3], s[7]); 251947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[4] = _mm_sub_epi16(s[0], s[4]); 252047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[5] = _mm_sub_epi16(s[1], s[5]); 252147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[6] = _mm_sub_epi16(s[2], s[6]); 252247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[7] = _mm_sub_epi16(s[3], s[7]); 252347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[8] = _mm_packs_epi32(u[0], u[1]); 252447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[9] = _mm_packs_epi32(u[2], u[3]); 252547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[10] = _mm_packs_epi32(u[4], u[5]); 252647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[11] = _mm_packs_epi32(u[6], u[7]); 252747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[12] = _mm_packs_epi32(u[8], u[9]); 252847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[13] = _mm_packs_epi32(u[10], u[11]); 252947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[14] = _mm_packs_epi32(u[12], u[13]); 253047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org x[15] = _mm_packs_epi32(u[14], u[15]); 253147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 253247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 3 253347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_unpacklo_epi16(x[4], x[5]); 253447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_unpackhi_epi16(x[4], x[5]); 253547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_unpacklo_epi16(x[6], x[7]); 253647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_unpackhi_epi16(x[6], x[7]); 
253747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_unpacklo_epi16(x[12], x[13]); 253847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_unpackhi_epi16(x[12], x[13]); 253947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_unpacklo_epi16(x[14], x[15]); 254047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_unpackhi_epi16(x[14], x[15]); 254147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 254247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); 254347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 254447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); 254547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 254647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); 254747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); 254847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 254947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 255047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); 255147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); 255247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); 255347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); 255447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[12] = 
_mm_madd_epi16(u[6], k__cospi_m24_p08); 255547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); 255647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); 255747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); 255847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 255947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi32(v[0], v[4]); 256047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_add_epi32(v[1], v[5]); 256147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_add_epi32(v[2], v[6]); 256247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_add_epi32(v[3], v[7]); 256347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_sub_epi32(v[0], v[4]); 256447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_sub_epi32(v[1], v[5]); 256547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_sub_epi32(v[2], v[6]); 256647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_sub_epi32(v[3], v[7]); 256747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[8] = _mm_add_epi32(v[8], v[12]); 256847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[9] = _mm_add_epi32(v[9], v[13]); 256947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[10] = _mm_add_epi32(v[10], v[14]); 257047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[11] = _mm_add_epi32(v[11], v[15]); 257147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[12] = _mm_sub_epi32(v[8], v[12]); 257247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[13] = _mm_sub_epi32(v[9], v[13]); 
257347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[14] = _mm_sub_epi32(v[10], v[14]); 257447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[15] = _mm_sub_epi32(v[11], v[15]); 257547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 257647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 257747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 257847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 257947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 258047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 258147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 258247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 258347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 258447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 258547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 258647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 258747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 258847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 258947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 
259047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 259147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 259247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 259347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 259447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 259547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 259647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 259747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 259847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 259947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 260047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 260147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 260247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 260347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 260447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 260547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 260647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 
260747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 260847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 260947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 261047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[0] = _mm_add_epi16(x[0], x[2]); 261147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[1] = _mm_add_epi16(x[1], x[3]); 261247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[2] = _mm_sub_epi16(x[0], x[2]); 261347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[3] = _mm_sub_epi16(x[1], x[3]); 261447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[4] = _mm_packs_epi32(v[0], v[1]); 261547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[5] = _mm_packs_epi32(v[2], v[3]); 261647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[6] = _mm_packs_epi32(v[4], v[5]); 261747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[7] = _mm_packs_epi32(v[6], v[7]); 261847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[8] = _mm_add_epi16(x[8], x[10]); 261947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[9] = _mm_add_epi16(x[9], x[11]); 262047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[10] = _mm_sub_epi16(x[8], x[10]); 262147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[11] = _mm_sub_epi16(x[9], x[11]); 262247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[12] = _mm_packs_epi32(v[8], v[9]); 262347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[13] = _mm_packs_epi32(v[10], v[11]); 262447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[14] = _mm_packs_epi32(v[12], v[13]); 262547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org s[15] = _mm_packs_epi32(v[14], v[15]); 
262647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 262747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // stage 4 262847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_unpacklo_epi16(s[2], s[3]); 262947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_unpackhi_epi16(s[2], s[3]); 263047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_unpacklo_epi16(s[6], s[7]); 263147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_unpackhi_epi16(s[6], s[7]); 263247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_unpacklo_epi16(s[10], s[11]); 263347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_unpackhi_epi16(s[10], s[11]); 263447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_unpacklo_epi16(s[14], s[15]); 263547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_unpackhi_epi16(s[14], s[15]); 263647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 263747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); 263847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); 263947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 264047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 264147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 264247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 264347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 264447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = 
_mm_madd_epi16(u[3], k__cospi_m16_p16); 264547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); 264647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); 264747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); 264847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); 264947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); 265047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); 265147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); 265247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); 265347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 265447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 265547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 265647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 265747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 265847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 265947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 266047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 266147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[7] = _mm_add_epi32(v[7], 
k__DCT_CONST_ROUNDING); 266247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 266347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 266447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 266547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 266647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 266747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 266847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 266947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 267047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 267147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 267247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 267347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 267447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 267547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 267647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 267747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 267847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 
267947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 268047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 268147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 268247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 268347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 268447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 268547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 268647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 268747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 268847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[0] = s[0]; 268947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[1] = _mm_sub_epi16(kZero, s[8]); 269047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[2] = s[12]; 269147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[3] = _mm_sub_epi16(kZero, s[4]); 269247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[4] = _mm_packs_epi32(v[4], v[5]); 269347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[5] = _mm_packs_epi32(v[12], v[13]); 269447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[6] = _mm_packs_epi32(v[8], v[9]); 269547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[7] = _mm_packs_epi32(v[0], v[1]); 269647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[8] = _mm_packs_epi32(v[2], v[3]); 269747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[9] = 
_mm_packs_epi32(v[10], v[11]); 269847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[10] = _mm_packs_epi32(v[14], v[15]); 269947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[11] = _mm_packs_epi32(v[6], v[7]); 270047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[12] = s[5]; 270147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[13] = _mm_sub_epi16(kZero, s[13]); 270247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[14] = s[9]; 270347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org in[15] = _mm_sub_epi16(kZero, s[1]); 270447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 270547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org /* fdct16_sse2: passes in0 and then in1 to fdct16_8col, then calls array_transpose_16x16(in0, in1). NOTE(review): presumably in0/in1 are the two 8-column halves of one 16x16 block -- confirm against load_buffer_16x16. */2706dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fdct16_sse2(__m128i *in0, __m128i *in1) { 2707dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fdct16_8col(in0); 2708dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fdct16_8col(in1); 270947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org array_transpose_16x16(in0, in1); 271047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 271147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org /* fadst16_sse2: passes in0 and then in1 to fadst16_8col (which overwrites in[0..15] of its argument), then calls array_transpose_16x16(in0, in1). */2712dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.orgvoid fadst16_sse2(__m128i *in0, __m128i *in1) { 2713dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst16_8col(in0); 2714dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst16_8col(in1); 271547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org array_transpose_16x16(in0, in1); 271647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 271747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org /* vp9_fht16x16_sse2: dispatches on tx_type. DCT_DCT forwards input, output and stride to vp9_fdct16x16_sse2. ADST_DCT, DCT_ADST and ADST_ADST each call load_buffer_16x16(input, in0, in1, stride), run a first pass (fadst16_sse2, fdct16_sse2, fadst16_sse2 respectively), call right_shift_16x16, run a second pass (fdct16_sse2, fadst16_sse2, fadst16_sse2 respectively) and finish with write_buffer_16x16, passing the literal 16 as the last argument rather than stride. NOTE(review): that last argument is presumably the output row pitch -- confirm against write_buffer_16x16. */271876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_fht16x16_sse2(const int16_t *input, int16_t *output, 
271976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org int stride, int tx_type) { 272047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i in0[16], in1[16]; 272176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 272247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org switch (tx_type) { 272376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case DCT_DCT: 272476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org vp9_fdct16x16_sse2(input, output, stride); 272547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 272676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case ADST_DCT: 272776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org load_buffer_16x16(input, in0, in1, stride); 2728dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst16_sse2(in0, in1); 272947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org right_shift_16x16(in0, in1); 2730dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fdct16_sse2(in0, in1); 273176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org write_buffer_16x16(output, in0, in1, 16); 273247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 273376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case DCT_ADST: 273476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org load_buffer_16x16(input, in0, in1, stride); 2735dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fdct16_sse2(in0, in1); 273647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org right_shift_16x16(in0, in1); 2737dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst16_sse2(in0, in1); 273876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org write_buffer_16x16(output, in0, in1, 16); 273947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 
274076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org case ADST_ADST: 274176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org load_buffer_16x16(input, in0, in1, stride); 2742dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst16_sse2(in0, in1); 274347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org right_shift_16x16(in0, in1); 2744dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org fadst16_sse2(in0, in1); 274576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org write_buffer_16x16(output, in0, in1, 16); 274647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; /* Any other tx_type: this branch only executes assert(0), so nothing is written to output here when assertions are compiled out (NDEBUG). */274747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org default: 274847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org assert(0); 274947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org break; 275047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org } 275147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 275247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 275388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgvoid vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) { 275488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i in0, in1, in2, in3; 275588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i u0, u1; 275688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i sum = _mm_setzero_si128(); 275788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org int i; 275888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 275988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org for (i = 0; i < 8; ++i) { 276088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_load_si128((const __m128i *)(input + 0)); 276188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_load_si128((const 
__m128i *)(input + 8)); 276288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in2 = _mm_load_si128((const __m128i *)(input + 16)); 276388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in3 = _mm_load_si128((const __m128i *)(input + 24)); 276488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 276588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org input += stride; 276688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_add_epi16(in0, in1); 276788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u1 = _mm_add_epi16(in2, in3); 276888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u0); 276988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 277088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_load_si128((const __m128i *)(input + 0)); 277188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_load_si128((const __m128i *)(input + 8)); 277288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in2 = _mm_load_si128((const __m128i *)(input + 16)); 277388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in3 = _mm_load_si128((const __m128i *)(input + 24)); 277488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 277588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org input += stride; 277688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u1); 277788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_add_epi16(in0, in1); 277888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u1 = _mm_add_epi16(in2, in3); 277988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u0); 278088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 278188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_load_si128((const __m128i *)(input + 0)); 
278288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_load_si128((const __m128i *)(input + 8)); 278388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in2 = _mm_load_si128((const __m128i *)(input + 16)); 278488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in3 = _mm_load_si128((const __m128i *)(input + 24)); 278588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 278688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org input += stride; 278788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u1); 278888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_add_epi16(in0, in1); 278988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u1 = _mm_add_epi16(in2, in3); 279088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u0); 279188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 279288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_load_si128((const __m128i *)(input + 0)); 279388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_load_si128((const __m128i *)(input + 8)); 279488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in2 = _mm_load_si128((const __m128i *)(input + 16)); 279588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in3 = _mm_load_si128((const __m128i *)(input + 24)); 279688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 279788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org input += stride; 279888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u1); 279988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_add_epi16(in0, in1); 280088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u1 = _mm_add_epi16(in2, in3); 280188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u0); 
280288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 280388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi16(sum, u1); 280488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org } 280588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 280688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org u0 = _mm_setzero_si128(); 280788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_unpacklo_epi16(u0, sum); 280888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_unpackhi_epi16(u0, sum); 280988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_srai_epi32(in0, 16); 281088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_srai_epi32(in1, 16); 281188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 281288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi32(in0, in1); 281388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_unpacklo_epi32(sum, u0); 281488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_unpackhi_epi32(sum, u0); 281588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 281688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org sum = _mm_add_epi32(in0, in1); 281788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in0 = _mm_srli_si128(sum, 8); 281888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 281988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_add_epi32(sum, in0); 282088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org in1 = _mm_srai_epi32(in1, 3); 282188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm_store_si128((__m128i *)(output), in1); 282288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org} 282388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org 
2824ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org#define FDCT32x32_2D vp9_fdct32x32_rd_sse2 282553a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#define FDCT32x32_HIGH_PRECISION 0 282653a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" 282753a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#undef FDCT32x32_HIGH_PRECISION 2828ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org#undef FDCT32x32_2D 282953a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org 2830ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org#define FDCT32x32_2D vp9_fdct32x32_sse2 283153a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#define FDCT32x32_HIGH_PRECISION 1 283253a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT 283353a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#undef FDCT32x32_HIGH_PRECISION 2834ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org#undef FDCT32x32_2D 2835