1/* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vpx_dsp_rtcd.h" 12#include "vpx_dsp/x86/inv_txfm_sse2.h" 13#include "vpx_dsp/x86/transpose_sse2.h" 14#include "vpx_dsp/x86/txfm_common_sse2.h" 15 16void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, 17 int stride) { 18 const __m128i eight = _mm_set1_epi16(8); 19 __m128i in[2]; 20 21 // Rows 22 in[0] = load_input_data(input); 23 in[1] = load_input_data(input + 8); 24 idct4_sse2(in); 25 26 // Columns 27 idct4_sse2(in); 28 29 // Final round and shift 30 in[0] = _mm_add_epi16(in[0], eight); 31 in[1] = _mm_add_epi16(in[1], eight); 32 in[0] = _mm_srai_epi16(in[0], 4); 33 in[1] = _mm_srai_epi16(in[1], 4); 34 35 recon_and_store4x4_sse2(in, dest, stride); 36} 37 38void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, 39 int stride) { 40 const __m128i zero = _mm_setzero_si128(); 41 int a; 42 __m128i dc_value, d[2]; 43 44 a = (int)dct_const_round_shift(input[0] * cospi_16_64); 45 a = (int)dct_const_round_shift(a * cospi_16_64); 46 a = ROUND_POWER_OF_TWO(a, 4); 47 48 dc_value = _mm_set1_epi16(a); 49 50 // Reconstruction and Store 51 d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); 52 d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); 53 d[0] = _mm_unpacklo_epi32(d[0], 54 _mm_cvtsi32_si128(*(const int *)(dest + stride))); 55 d[1] = _mm_unpacklo_epi32( 56 _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); 57 d[0] = _mm_unpacklo_epi8(d[0], zero); 58 d[1] = _mm_unpacklo_epi8(d[1], zero); 59 d[0] = _mm_add_epi16(d[0], dc_value); 60 d[1] = _mm_add_epi16(d[1], dc_value); 61 d[0] = _mm_packus_epi16(d[0], d[1]); 
62 63 *(int *)dest = _mm_cvtsi128_si32(d[0]); 64 d[0] = _mm_srli_si128(d[0], 4); 65 *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); 66 d[0] = _mm_srli_si128(d[0], 4); 67 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); 68 d[0] = _mm_srli_si128(d[0], 4); 69 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); 70} 71 72void idct4_sse2(__m128i *in) { 73 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); 74 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 75 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 76 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 77 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 78 __m128i u[8], v[8]; 79 80 transpose_16bit_4x4(in); 81 // stage 1 82 u[0] = _mm_unpacklo_epi16(in[0], in[1]); 83 u[1] = _mm_unpackhi_epi16(in[0], in[1]); 84 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 85 v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 86 v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 87 v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 88 89 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 90 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 91 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 92 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 93 94 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 95 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 96 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 97 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 98 99 u[0] = _mm_packs_epi32(v[0], v[1]); 100 u[1] = _mm_packs_epi32(v[3], v[2]); 101 102 // stage 2 103 in[0] = _mm_add_epi16(u[0], u[1]); 104 in[1] = _mm_sub_epi16(u[0], u[1]); 105 in[1] = _mm_shuffle_epi32(in[1], 0x4E); 106} 107 108void iadst4_sse2(__m128i *in) { 109 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); 110 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); 111 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, 
-sinpi_1_9); 112 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); 113 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); 114 const __m128i kZero = _mm_set1_epi16(0); 115 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 116 __m128i u[8], v[8], in7; 117 118 transpose_16bit_4x4(in); 119 in7 = _mm_srli_si128(in[1], 8); 120 in7 = _mm_add_epi16(in7, in[0]); 121 in7 = _mm_sub_epi16(in7, in[1]); 122 123 u[0] = _mm_unpacklo_epi16(in[0], in[1]); 124 u[1] = _mm_unpackhi_epi16(in[0], in[1]); 125 u[2] = _mm_unpacklo_epi16(in7, kZero); 126 u[3] = _mm_unpackhi_epi16(in[0], kZero); 127 128 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 129 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 130 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 131 v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 132 v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 133 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 134 135 u[0] = _mm_add_epi32(v[0], v[1]); 136 u[1] = _mm_add_epi32(v[3], v[4]); 137 u[2] = v[2]; 138 u[3] = _mm_add_epi32(u[0], u[1]); 139 u[4] = _mm_slli_epi32(v[5], 2); 140 u[5] = _mm_add_epi32(u[3], v[5]); 141 u[6] = _mm_sub_epi32(u[5], u[4]); 142 143 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 144 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 145 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 146 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 147 148 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 149 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 150 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 151 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 152 153 in[0] = _mm_packs_epi32(u[0], u[1]); 154 in[1] = _mm_packs_epi32(u[2], u[3]); 155} 156 157#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ 158 { \ 159 tmp0 = _mm_madd_epi16(lo_0, cst0); \ 160 tmp1 = _mm_madd_epi16(hi_0, cst0); \ 161 tmp2 = _mm_madd_epi16(lo_0, cst1); \ 162 tmp3 = 
_mm_madd_epi16(hi_0, cst1); \ 163 \ 164 tmp0 = _mm_add_epi32(tmp0, rounding); \ 165 tmp1 = _mm_add_epi32(tmp1, rounding); \ 166 tmp2 = _mm_add_epi32(tmp2, rounding); \ 167 tmp3 = _mm_add_epi32(tmp3, rounding); \ 168 \ 169 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 170 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 171 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 172 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 173 \ 174 res0 = _mm_packs_epi32(tmp0, tmp1); \ 175 res1 = _mm_packs_epi32(tmp2, tmp3); \ 176 } 177 178#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ 179 out4, out5, out6, out7) \ 180 { \ 181 /* Stage1 */ \ 182 { \ 183 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ 184 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ 185 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ 186 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ 187 \ 188 MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ 189 stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ 190 } \ 191 \ 192 /* Stage2 */ \ 193 { \ 194 const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ 195 const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ 196 const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ 197 const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ 198 \ 199 MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ 200 stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ 201 \ 202 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ 203 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ 204 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ 205 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ 206 } \ 207 \ 208 /* Stage3 */ \ 209 { \ 210 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 211 const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 212 \ 213 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ 214 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ 215 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ 216 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ 217 \ 218 
tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ 219 tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ 220 tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ 221 tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ 222 \ 223 tmp0 = _mm_add_epi32(tmp0, rounding); \ 224 tmp1 = _mm_add_epi32(tmp1, rounding); \ 225 tmp2 = _mm_add_epi32(tmp2, rounding); \ 226 tmp3 = _mm_add_epi32(tmp3, rounding); \ 227 \ 228 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 229 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 230 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 231 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 232 \ 233 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 234 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 235 } \ 236 \ 237 /* Stage4 */ \ 238 out0 = _mm_add_epi16(stp1_0, stp2_7); \ 239 out1 = _mm_add_epi16(stp1_1, stp1_6); \ 240 out2 = _mm_add_epi16(stp1_2, stp1_5); \ 241 out3 = _mm_add_epi16(stp1_3, stp2_4); \ 242 out4 = _mm_sub_epi16(stp1_3, stp2_4); \ 243 out5 = _mm_sub_epi16(stp1_2, stp1_5); \ 244 out6 = _mm_sub_epi16(stp1_1, stp1_6); \ 245 out7 = _mm_sub_epi16(stp1_0, stp2_7); \ 246 } 247 248void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, 249 int stride) { 250 const __m128i zero = _mm_setzero_si128(); 251 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 252 const __m128i final_rounding = _mm_set1_epi16(1 << 4); 253 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 254 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 255 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 256 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 257 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 258 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 259 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 260 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 261 262 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 263 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 
264 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 265 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 266 int i; 267 268 // Load input data. 269 in0 = load_input_data(input); 270 in1 = load_input_data(input + 8 * 1); 271 in2 = load_input_data(input + 8 * 2); 272 in3 = load_input_data(input + 8 * 3); 273 in4 = load_input_data(input + 8 * 4); 274 in5 = load_input_data(input + 8 * 5); 275 in6 = load_input_data(input + 8 * 6); 276 in7 = load_input_data(input + 8 * 7); 277 278 // 2-D 279 for (i = 0; i < 2; i++) { 280 // 8x8 Transpose is copied from vpx_fdct8x8_sse2() 281 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 282 in4, in5, in6, in7); 283 284 // 4-stage 1D idct8x8 285 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, 286 in6, in7); 287 } 288 289 // Final rounding and shift 290 in0 = _mm_adds_epi16(in0, final_rounding); 291 in1 = _mm_adds_epi16(in1, final_rounding); 292 in2 = _mm_adds_epi16(in2, final_rounding); 293 in3 = _mm_adds_epi16(in3, final_rounding); 294 in4 = _mm_adds_epi16(in4, final_rounding); 295 in5 = _mm_adds_epi16(in5, final_rounding); 296 in6 = _mm_adds_epi16(in6, final_rounding); 297 in7 = _mm_adds_epi16(in7, final_rounding); 298 299 in0 = _mm_srai_epi16(in0, 5); 300 in1 = _mm_srai_epi16(in1, 5); 301 in2 = _mm_srai_epi16(in2, 5); 302 in3 = _mm_srai_epi16(in3, 5); 303 in4 = _mm_srai_epi16(in4, 5); 304 in5 = _mm_srai_epi16(in5, 5); 305 in6 = _mm_srai_epi16(in6, 5); 306 in7 = _mm_srai_epi16(in7, 5); 307 308 RECON_AND_STORE(dest + 0 * stride, in0); 309 RECON_AND_STORE(dest + 1 * stride, in1); 310 RECON_AND_STORE(dest + 2 * stride, in2); 311 RECON_AND_STORE(dest + 3 * stride, in3); 312 RECON_AND_STORE(dest + 4 * stride, in4); 313 RECON_AND_STORE(dest + 5 * stride, in5); 314 RECON_AND_STORE(dest + 6 * stride, in6); 315 RECON_AND_STORE(dest + 7 * stride, in7); 316} 317 318void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, 319 int stride) { 320 __m128i 
dc_value; 321 const __m128i zero = _mm_setzero_si128(); 322 int a; 323 324 a = (int)dct_const_round_shift(input[0] * cospi_16_64); 325 a = (int)dct_const_round_shift(a * cospi_16_64); 326 a = ROUND_POWER_OF_TWO(a, 5); 327 328 dc_value = _mm_set1_epi16(a); 329 330 RECON_AND_STORE(dest + 0 * stride, dc_value); 331 RECON_AND_STORE(dest + 1 * stride, dc_value); 332 RECON_AND_STORE(dest + 2 * stride, dc_value); 333 RECON_AND_STORE(dest + 3 * stride, dc_value); 334 RECON_AND_STORE(dest + 4 * stride, dc_value); 335 RECON_AND_STORE(dest + 5 * stride, dc_value); 336 RECON_AND_STORE(dest + 6 * stride, dc_value); 337 RECON_AND_STORE(dest + 7 * stride, dc_value); 338} 339 340void idct8_sse2(__m128i *in) { 341 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 342 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 343 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 344 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 345 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 346 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 347 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 348 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 349 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 350 351 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 352 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 353 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 354 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 355 356 // 8x8 Transpose is copied from vpx_fdct8x8_sse2() 357 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, 358 in1, in2, in3, in4, in5, in6, in7); 359 360 // 4-stage 1D idct8x8 361 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], 362 in[4], in[5], in[6], in[7]); 363} 364 365void iadst8_sse2(__m128i *in) { 366 const __m128i k__cospi_p02_p30 = 
pair_set_epi16(cospi_2_64, cospi_30_64); 367 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 368 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 369 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 370 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 371 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 372 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 373 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 374 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 375 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 376 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 377 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 378 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); 379 const __m128i k__const_0 = _mm_set1_epi16(0); 380 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 381 382 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; 383 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 384 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; 385 __m128i s0, s1, s2, s3, s4, s5, s6, s7; 386 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 387 388 // transpose 389 array_transpose_8x8(in, in); 390 391 // properly aligned for butterfly input 392 in0 = in[7]; 393 in1 = in[0]; 394 in2 = in[5]; 395 in3 = in[2]; 396 in4 = in[3]; 397 in5 = in[4]; 398 in6 = in[1]; 399 in7 = in[6]; 400 401 // column transformation 402 // stage 1 403 // interleave and multiply/add into 32-bit integer 404 s0 = _mm_unpacklo_epi16(in0, in1); 405 s1 = _mm_unpackhi_epi16(in0, in1); 406 s2 = _mm_unpacklo_epi16(in2, in3); 407 s3 = _mm_unpackhi_epi16(in2, in3); 408 s4 = _mm_unpacklo_epi16(in4, in5); 409 
s5 = _mm_unpackhi_epi16(in4, in5); 410 s6 = _mm_unpacklo_epi16(in6, in7); 411 s7 = _mm_unpackhi_epi16(in6, in7); 412 413 u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); 414 u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); 415 u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); 416 u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); 417 u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); 418 u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); 419 u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); 420 u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); 421 u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); 422 u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); 423 u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); 424 u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); 425 u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); 426 u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); 427 u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); 428 u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); 429 430 // addition 431 w0 = _mm_add_epi32(u0, u8); 432 w1 = _mm_add_epi32(u1, u9); 433 w2 = _mm_add_epi32(u2, u10); 434 w3 = _mm_add_epi32(u3, u11); 435 w4 = _mm_add_epi32(u4, u12); 436 w5 = _mm_add_epi32(u5, u13); 437 w6 = _mm_add_epi32(u6, u14); 438 w7 = _mm_add_epi32(u7, u15); 439 w8 = _mm_sub_epi32(u0, u8); 440 w9 = _mm_sub_epi32(u1, u9); 441 w10 = _mm_sub_epi32(u2, u10); 442 w11 = _mm_sub_epi32(u3, u11); 443 w12 = _mm_sub_epi32(u4, u12); 444 w13 = _mm_sub_epi32(u5, u13); 445 w14 = _mm_sub_epi32(u6, u14); 446 w15 = _mm_sub_epi32(u7, u15); 447 448 // shift and rounding 449 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 450 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 451 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 452 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 453 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 454 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 455 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 456 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 457 v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); 458 v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); 459 v10 = _mm_add_epi32(w10, 
k__DCT_CONST_ROUNDING); 460 v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); 461 v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); 462 v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); 463 v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); 464 v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); 465 466 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 467 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 468 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 469 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 470 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 471 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 472 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 473 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 474 u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); 475 u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); 476 u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); 477 u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); 478 u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); 479 u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); 480 u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); 481 u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); 482 483 // back to 16-bit and pack 8 integers into __m128i 484 in[0] = _mm_packs_epi32(u0, u1); 485 in[1] = _mm_packs_epi32(u2, u3); 486 in[2] = _mm_packs_epi32(u4, u5); 487 in[3] = _mm_packs_epi32(u6, u7); 488 in[4] = _mm_packs_epi32(u8, u9); 489 in[5] = _mm_packs_epi32(u10, u11); 490 in[6] = _mm_packs_epi32(u12, u13); 491 in[7] = _mm_packs_epi32(u14, u15); 492 493 // stage 2 494 s0 = _mm_add_epi16(in[0], in[2]); 495 s1 = _mm_add_epi16(in[1], in[3]); 496 s2 = _mm_sub_epi16(in[0], in[2]); 497 s3 = _mm_sub_epi16(in[1], in[3]); 498 u0 = _mm_unpacklo_epi16(in[4], in[5]); 499 u1 = _mm_unpackhi_epi16(in[4], in[5]); 500 u2 = _mm_unpacklo_epi16(in[6], in[7]); 501 u3 = _mm_unpackhi_epi16(in[6], in[7]); 502 503 v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); 504 v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); 505 v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); 506 v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); 507 v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); 508 v5 = _mm_madd_epi16(u3, 
k__cospi_m24_p08); 509 v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); 510 v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); 511 512 w0 = _mm_add_epi32(v0, v4); 513 w1 = _mm_add_epi32(v1, v5); 514 w2 = _mm_add_epi32(v2, v6); 515 w3 = _mm_add_epi32(v3, v7); 516 w4 = _mm_sub_epi32(v0, v4); 517 w5 = _mm_sub_epi32(v1, v5); 518 w6 = _mm_sub_epi32(v2, v6); 519 w7 = _mm_sub_epi32(v3, v7); 520 521 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 522 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 523 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 524 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 525 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 526 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 527 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 528 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 529 530 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 531 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 532 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 533 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 534 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 535 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 536 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 537 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 538 539 // back to 16-bit intergers 540 s4 = _mm_packs_epi32(u0, u1); 541 s5 = _mm_packs_epi32(u2, u3); 542 s6 = _mm_packs_epi32(u4, u5); 543 s7 = _mm_packs_epi32(u6, u7); 544 545 // stage 3 546 u0 = _mm_unpacklo_epi16(s2, s3); 547 u1 = _mm_unpackhi_epi16(s2, s3); 548 u2 = _mm_unpacklo_epi16(s6, s7); 549 u3 = _mm_unpackhi_epi16(s6, s7); 550 551 v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); 552 v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); 553 v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); 554 v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); 555 v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); 556 v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); 557 v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); 558 v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); 559 560 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 561 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 562 u2 = 
_mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 563 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 564 u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 565 u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 566 u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 567 u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 568 569 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 570 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 571 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 572 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 573 v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 574 v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 575 v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 576 v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 577 578 s2 = _mm_packs_epi32(v0, v1); 579 s3 = _mm_packs_epi32(v2, v3); 580 s6 = _mm_packs_epi32(v4, v5); 581 s7 = _mm_packs_epi32(v6, v7); 582 583 in[0] = s0; 584 in[1] = _mm_sub_epi16(k__const_0, s4); 585 in[2] = s6; 586 in[3] = _mm_sub_epi16(k__const_0, s2); 587 in[4] = s3; 588 in[5] = _mm_sub_epi16(k__const_0, s7); 589 in[6] = s5; 590 in[7] = _mm_sub_epi16(k__const_0, s1); 591} 592 593void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, 594 int stride) { 595 const __m128i zero = _mm_setzero_si128(); 596 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 597 const __m128i final_rounding = _mm_set1_epi16(1 << 4); 598 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 599 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 600 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 601 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 602 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 603 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 604 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 605 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 606 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 607 608 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 609 __m128i 
stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 610 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 611 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 612 613 // Rows. Load 4-row input data. 614 in0 = load_input_data(input); 615 in1 = load_input_data(input + 8 * 1); 616 in2 = load_input_data(input + 8 * 2); 617 in3 = load_input_data(input + 8 * 3); 618 619 // 8x4 Transpose 620 TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); 621 // Stage1 622 { 623 const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); 624 const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); 625 626 tmp0 = _mm_madd_epi16(lo_17, stg1_0); 627 tmp2 = _mm_madd_epi16(lo_17, stg1_1); 628 tmp4 = _mm_madd_epi16(lo_35, stg1_2); 629 tmp6 = _mm_madd_epi16(lo_35, stg1_3); 630 631 tmp0 = _mm_add_epi32(tmp0, rounding); 632 tmp2 = _mm_add_epi32(tmp2, rounding); 633 tmp4 = _mm_add_epi32(tmp4, rounding); 634 tmp6 = _mm_add_epi32(tmp6, rounding); 635 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 636 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 637 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 638 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 639 640 stp1_4 = _mm_packs_epi32(tmp0, tmp2); 641 stp1_5 = _mm_packs_epi32(tmp4, tmp6); 642 } 643 644 // Stage2 645 { 646 const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); 647 const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); 648 649 tmp0 = _mm_madd_epi16(lo_04, stg2_0); 650 tmp2 = _mm_madd_epi16(lo_04, stg2_1); 651 tmp4 = _mm_madd_epi16(lo_26, stg2_2); 652 tmp6 = _mm_madd_epi16(lo_26, stg2_3); 653 654 tmp0 = _mm_add_epi32(tmp0, rounding); 655 tmp2 = _mm_add_epi32(tmp2, rounding); 656 tmp4 = _mm_add_epi32(tmp4, rounding); 657 tmp6 = _mm_add_epi32(tmp6, rounding); 658 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 659 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 660 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 661 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 662 663 stp2_0 = _mm_packs_epi32(tmp0, tmp2); 664 stp2_2 = 
_mm_packs_epi32(tmp6, tmp4); 665 666 tmp0 = _mm_add_epi16(stp1_4, stp1_5); 667 tmp1 = _mm_sub_epi16(stp1_4, stp1_5); 668 669 stp2_4 = tmp0; 670 stp2_5 = _mm_unpacklo_epi64(tmp1, zero); 671 stp2_6 = _mm_unpackhi_epi64(tmp1, zero); 672 } 673 674 // Stage3 675 { 676 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); 677 678 tmp4 = _mm_add_epi16(stp2_0, stp2_2); 679 tmp6 = _mm_sub_epi16(stp2_0, stp2_2); 680 681 stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); 682 stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); 683 684 tmp0 = _mm_madd_epi16(lo_56, stg3_0); 685 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 686 687 tmp0 = _mm_add_epi32(tmp0, rounding); 688 tmp2 = _mm_add_epi32(tmp2, rounding); 689 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 690 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 691 692 stp1_5 = _mm_packs_epi32(tmp0, tmp2); 693 } 694 695 // Stage4 696 tmp0 = _mm_add_epi16(stp1_3, stp2_4); 697 tmp1 = _mm_add_epi16(stp1_2, stp1_5); 698 tmp2 = _mm_sub_epi16(stp1_3, stp2_4); 699 tmp3 = _mm_sub_epi16(stp1_2, stp1_5); 700 701 TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) 702 703 IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, 704 in5, in6, in7); 705 // Final rounding and shift 706 in0 = _mm_adds_epi16(in0, final_rounding); 707 in1 = _mm_adds_epi16(in1, final_rounding); 708 in2 = _mm_adds_epi16(in2, final_rounding); 709 in3 = _mm_adds_epi16(in3, final_rounding); 710 in4 = _mm_adds_epi16(in4, final_rounding); 711 in5 = _mm_adds_epi16(in5, final_rounding); 712 in6 = _mm_adds_epi16(in6, final_rounding); 713 in7 = _mm_adds_epi16(in7, final_rounding); 714 715 in0 = _mm_srai_epi16(in0, 5); 716 in1 = _mm_srai_epi16(in1, 5); 717 in2 = _mm_srai_epi16(in2, 5); 718 in3 = _mm_srai_epi16(in3, 5); 719 in4 = _mm_srai_epi16(in4, 5); 720 in5 = _mm_srai_epi16(in5, 5); 721 in6 = _mm_srai_epi16(in6, 5); 722 in7 = _mm_srai_epi16(in7, 5); 723 724 RECON_AND_STORE(dest + 0 * stride, in0); 725 RECON_AND_STORE(dest + 1 * stride, in1); 726 
RECON_AND_STORE(dest + 2 * stride, in2); 727 RECON_AND_STORE(dest + 3 * stride, in3); 728 RECON_AND_STORE(dest + 4 * stride, in4); 729 RECON_AND_STORE(dest + 5 * stride, in5); 730 RECON_AND_STORE(dest + 6 * stride, in6); 731 RECON_AND_STORE(dest + 7 * stride, in7); 732} 733 734#define IDCT16 \ 735 /* Stage2 */ \ 736 { \ 737 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ 738 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ 739 const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ 740 const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ 741 const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ 742 const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ 743 const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ 744 const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ 745 \ 746 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ 747 stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ 748 \ 749 MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ 750 stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ 751 } \ 752 \ 753 /* Stage3 */ \ 754 { \ 755 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ 756 const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ 757 const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ 758 const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ 759 \ 760 MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ 761 stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ 762 \ 763 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ 764 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ 765 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ 766 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ 767 \ 768 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ 769 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ 770 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ 771 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ 772 } \ 773 \ 774 /* Stage4 */ 
\
  /* Stage4: even-half butterflies (stg4 cosine pairs) and recombination   */ \
  /* of the odd half computed in the previous stages.                      */ \
  {                                                                         \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);                \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]);                \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]);              \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]);              \
                                                                            \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);            \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);            \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);          \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);          \
                                                                            \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \
                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)  \
                                                                            \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                 \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                 \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                 \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                 \
                                                                            \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,    \
                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
                           stp2_13)                                         \
  }                                                                         \
                                                                            \
  /* Stage5: add/sub butterflies on 0..3 and 8..15; 5/6 need a rotation by  */ \
  /* cospi_16, done with madd + DCT_CONST rounding and re-pack to 16 bits. */ \
  {                                                                         \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);              \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);              \
                                                                            \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                 \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                 \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                 \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                 \
                                                                            \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                  \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                  \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                  \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                  \
                                                                            \
    tmp0 = _mm_add_epi32(tmp0, rounding);                                   \
    tmp1 = _mm_add_epi32(tmp1, rounding);                                   \
    tmp2 = _mm_add_epi32(tmp2, rounding);                                   \
    tmp3 = _mm_add_epi32(tmp3, rounding);                                   \
                                                                            \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                            \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                            \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                            \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                            \
                                                                            \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
                                                                            \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                              \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                               \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                             \
                                                                            \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                            \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                              \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                              \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                            \
  }                                                                         \
                                                                            \
  /* Stage6: final butterflies; 10..13 need a cospi_16 rotation, done via  */ \
  /* MULTIPLICATION_AND_ADD with the stg6_0/stg4_0 pairs.                  */ \
  {                                                                         \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);          \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);          \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);          \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);          \
                                                                            \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                 \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                 \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                 \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                 \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                 \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                 \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                 \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                 \
                                                                            \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,  \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
                           stp2_12)                                         \
  }

// Reduced 1-D 16-point IDCT: only in[0]..in[3] are read; every other input
// row is treated as zero (the `zero` register is unpacked in their place).
// That lets Stage3/4 replace several butterflies with plain copies
// (e.g. stp1_9 = stp1_8_0) before the normal Stage5/6 recombination.
#define IDCT16_10                                                           \
  /* Stage2 */                                                              \
  {                                                                         \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero);                \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero);                \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]);                \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]);                \
                                                                            \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
                           stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11,      \
                           stp1_12_0)                                       \
  }                                                                         \
                                                                            \
  /* Stage3: with odd inputs zero the stage collapses to copies. */         \
  {                                                                         \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero);                \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero);                \
                                                                            \
    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
                                                                            \
    stp1_9 = stp1_8_0;                                                      \
    stp1_10 = stp1_11;                                                      \
                                                                            \
    stp1_13 = stp1_12_0;                                                    \
    stp1_14 = stp1_15;                                                      \
  }                                                                         \
                                                                            \
  /* Stage4 */                                                              \
  {                                                                         \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);                 \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero);                 \
                                                                            \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);            \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);            \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);          \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);          \
                                                                            \
    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \
    stp2_5 = stp2_4;                                                        \
    stp2_6 = stp2_7;                                                        \
                                                                            \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,    \
                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
                           stp2_13)                                         \
  }                                                                         \
                                                                            \
  /* Stage5: same shape as full IDCT16 Stage5, with 2/3 plain copies. */    \
  {                                                                         \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);              \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);              \
                                                                            \
    stp1_2 = stp1_1;                                                        \
    stp1_3 = stp1_0;                                                        \
                                                                            \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                  \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                  \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                  \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                  \
                                                                            \
    tmp0 = _mm_add_epi32(tmp0, rounding);                                   \
    tmp1 = _mm_add_epi32(tmp1, rounding);                                   \
    tmp2 = _mm_add_epi32(tmp2, rounding);                                   \
    tmp3 = _mm_add_epi32(tmp3, rounding);                                   \
                                                                            \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                            \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                            \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                            \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                            \
                                                                            \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
                                                                            \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                              \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                               \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                             \
                                                                            \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                            \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                              \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                              \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                            \
  }                                                                         \
                                                                            \
  /* Stage6 */                                                              \
  {                                                                         \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);          \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);          \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);          \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);          \
                                                                            \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                 \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                 \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                 \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                 \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                 \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                 \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                 \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                 \
                                                                            \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,  \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
                           stp2_12)                                         \
  }

// Full 16x16 inverse DCT with reconstruction: two 1-D 16-point passes
// (rows, then columns, each via the IDCT16 macro), a final round/shift of
// (x + 32) >> 6, and the result is added to `dest` through RECON_AND_STORE.
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);  // for >> 6 below
  const __m128i zero = _mm_setzero_si128();

  // Butterfly cosine pairs for stages 2-6 of the 1-D IDCT, packed as
  // interleaved 16-bit constants for _mm_madd_epi16.
  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 =
pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  // in[]: 16 rows split as two 8-wide halves (in[0..7] left, in[8..15]
  // right); l[]/r[] hold the row-pass results for the two halves.
  // stp1_*/stp2_* are the per-stage butterfly outputs consumed by IDCT16;
  // tmp* are 32-bit intermediates used inside the macro.
  __m128i in[16], l[16], r[16], *curr1;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Pass 1 (rows): left half -> l[], right half -> r[].
  curr1 = l;
  for (i = 0; i < 2; i++) {
    // 1-D idct

    // Load input data. Rows interleave the two 8-wide halves: even loads
    // fill in[0..7], odd loads fill in[8..15].
    in[0] = load_input_data(input);
    in[8] = load_input_data(input + 8 * 1);
    in[1] = load_input_data(input + 8 * 2);
    in[9] = load_input_data(input + 8 * 3);
    in[2] = load_input_data(input + 8 * 4);
    in[10] = load_input_data(input + 8 * 5);
    in[3] = load_input_data(input + 8 * 6);
    in[11] = load_input_data(input + 8 * 7);
    in[4] = load_input_data(input + 8 * 8);
    in[12] = load_input_data(input + 8 * 9);
    in[5] = load_input_data(input + 8 * 10);
    in[13] = load_input_data(input + 8 * 11);
    in[6] = load_input_data(input + 8 * 12);
    in[14] = load_input_data(input + 8 * 13);
    in[7] = load_input_data(input + 8 * 14);
    in[15] = load_input_data(input + 8 * 15);

    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);

    IDCT16

    // Stage7: final add/sub butterflies of the 1-D transform.
    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

    curr1 = r;          // second iteration writes the right half
    input += 128;       // advance to the next 8x16 half of coefficients
  }
  // Pass 2 (columns): transpose the row-pass results back and repeat.
  for (i = 0; i < 2; i++) {
    int j;
    // 1-D idct
    array_transpose_8x8(l + i * 8, in);
    array_transpose_8x8(r + i * 8, in + 8);

    IDCT16

    // 2-D
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    for (j = 0; j < 16; ++j) {
      // Final rounding and shift: (x + 32) >> 6, saturating add.
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;  // move to the right 8 columns of the destination
  }
}

// DC-only 16x16 inverse DCT: computes the constant output value from
// input[0] (two cospi_16_64 rotations through dct_const_round_shift, then
// round by 2^6), splats it, and adds it to every pixel of the 16x16 block.
void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  // NOTE(review): `zero` appears unused here textually; presumably it is
  // referenced by the RECON_AND_STORE macro expansion — confirm.
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 16; ++i) {
    RECON_AND_STORE(dest + 0, dc_value);   // left 8 pixels of the row
    RECON_AND_STORE(dest + 8, dc_value);   // right 8 pixels of the row
    dest += stride;
  }
}

static void iadst16_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  // s/x: 16-bit stage results; u/v: 32-bit madd intermediates.
  __m128i s[16], x[16], u[32], v[32];
  // sin/cos pair constants for the ADST butterfly stages, interleaved for
  // _mm_madd_epi16 (names encode the two coefficients and their signs).
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i
k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  // stage 1: interleave input pairs (reversed index with forward index)
  // so each madd computes a two-term rotation per lane.
  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  // Cross add/sub of the two rotation halves (32-bit, pre-rounding).
  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  // Fixed-point round: add DCT_CONST_ROUNDING then shift by DCT_CONST_BITS.
  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  // Saturating pack back to 16 bits, two 32-bit vectors per result.
  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  // First half needs no rotation at this stage: plain 16-bit add/sub.
  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4: final cospi_16 rotations, then sign-fixed output ordering.
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  // Write outputs; odd-index results are negated (kZero - x).
  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

// 16-point inverse DCT applied down 8 columns of 16-bit coefficients.
// in[0..15] holds one row per __m128i (8 lanes each) and is overwritten
// with the transformed rows.  Implements the standard 7-stage idct16
// butterfly network in fixed point: each rotation pair is computed with
// _mm_madd_epi16 against a packed cosine-pair constant, rounded by adding
// DCT_CONST_ROUNDING, shifted right by DCT_CONST_BITS, and repacked to
// 16 bits with signed saturation (_mm_packs_epi32).
static void idct16_8col(__m128i *in) {
  // Packed (cos, cos) constant pairs consumed by _mm_madd_epi16 rotations.
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i v[16], u[16], s[16], t[16];

  // stage 1
  // Reorder inputs: even indices feed the 8-point half, odd indices feed
  // the 16-point odd half (standard idct16 input permutation).
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2
  // Rotations on the odd half: (s8,s15), (s9,s14), (s10,s13), (s11,s12).
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  // Round and shift back to 16-bit precision.
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8] = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9] = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  // Even half passes through; (s4,s7) and (s5,s6) get rotated; the odd
  // half does plain add/sub butterflies.
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4
  // DC/Nyquist and (t2,t3) rotations plus the +/-8,24 rotations on
  // (t9,t14) and (t10,t13).
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9] = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5
  // Even-half butterflies; (s5,s6) gets the +/-16 rotation; odd half does
  // add/sub butterflies across the quad boundaries.
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6
  // Final even-half combine; (t10,t13) and (t11,t12) get the +/-16
  // rotations.
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7
  // Mirror butterfly: out[k] = s[k] +/- s[15 - k].
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}

// Full 16x16 inverse DCT pass: transpose the two 8x16 halves, then run
// the 16-point column transform on each half.
void idct16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_8col(in0);
  idct16_8col(in1);
}

// Full 16x16 inverse ADST pass: same structure as idct16_sse2 but using
// the asymmetric sine transform on each half.
void iadst16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_8col(in0);
  iadst16_8col(in1);
}

// Inverse 16x16 DCT + reconstruction for blocks with at most 10 nonzero
// coefficients (all inside the top-left 4x4), so only the first four rows
// of 8 coefficients are loaded and the rest are treated as zero.
void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Second-pass output keeps 6 fractional bits; 1 << 5 rounds before >> 6.
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7
= pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i in[16], l[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
      stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // First 1-D inverse DCT
  // Load input data.  Only rows 0, 2, 4, 6 can be nonzero in the 10-coeff
  // case; the implied zero rows are folded in via the `zero` register.
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8 * 2);
  in[2] = load_input_data(input + 8 * 4);
  in[3] = load_input_data(input + 8 * 6);

  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);

  // Stage2
  {
    // Partner coefficients (15, 3, 14, ...) are zero, so each rotation
    // collapses to a multiply against a single input paired with zero.
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);

    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);

    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    // 13/14 are the upper 64-bit halves of the stage-2 results.
    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);

    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
    stp2_10 = _mm_packs_epi32(tmp5, tmp7);

    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
  }

  // Stage5 and Stage6
  {
    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);

    // Split the 4-wide butterfly results into their low/high 64-bit
    // halves so each stp1_* register carries one 4-lane quantity.
    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);

    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
  }

  // Stage6
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_6 = _mm_packs_epi32(tmp3, tmp1);

    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp2, zero);
    stp2_11 = _mm_packs_epi32(tmp4, zero);
    stp2_12 = _mm_packs_epi32(tmp6, zero);

    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);

    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage7. Left 8x16 only.
  l[0] = _mm_add_epi16(stp2_0, stp1_15);
  l[1] = _mm_add_epi16(stp2_1, stp1_14);
  l[2] = _mm_add_epi16(stp2_2, stp2_13);
  l[3] = _mm_add_epi16(stp2_3, stp2_12);
  l[4] = _mm_add_epi16(stp2_4, stp2_11);
  l[5] = _mm_add_epi16(stp2_5, stp2_10);
  l[6] = _mm_add_epi16(stp2_6, stp1_9);
  l[7] = _mm_add_epi16(stp2_7, stp1_8);
  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
  l[15] = _mm_sub_epi16(stp2_0, stp1_15);

  // Second 1-D inverse transform, performed per 8x16 block
  for (i = 0; i < 2; i++) {
    int j;
    array_transpose_4X8(l + 8 * i, in);

    IDCT16_10

    // Stage7
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

// Load one row of 8 dequantized coefficients and advance the input
// pointer; `input` is modified by the macro.
#define LOAD_DQCOEFF(reg, input) \
  {                              \
    reg = load_input_data(input);\
    input += 8;                  \
  }
2176#define IDCT32_34 \ 2177 /* Stage1 */ \ 2178 { \ 2179 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ 2180 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ 2181 \ 2182 const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ 2183 const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ 2184 \ 2185 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ 2186 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ 2187 \ 2188 const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ 2189 const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ 2190 \ 2191 MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ 2192 stp1_31); \ 2193 MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ 2194 stp1_28); \ 2195 MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ 2196 stp1_27); \ 2197 MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ 2198 stp1_24); \ 2199 } \ 2200 \ 2201 /* Stage2 */ \ 2202 { \ 2203 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ 2204 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ 2205 \ 2206 const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ 2207 const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ 2208 \ 2209 MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ 2210 stp2_15); \ 2211 MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ 2212 stp2_12); \ 2213 \ 2214 stp2_16 = stp1_16; \ 2215 stp2_19 = stp1_19; \ 2216 \ 2217 stp2_20 = stp1_20; \ 2218 stp2_23 = stp1_23; \ 2219 \ 2220 stp2_24 = stp1_24; \ 2221 stp2_27 = stp1_27; \ 2222 \ 2223 stp2_28 = stp1_28; \ 2224 stp2_31 = stp1_31; \ 2225 } \ 2226 \ 2227 /* Stage3 */ \ 2228 { \ 2229 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ 2230 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ 2231 \ 2232 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ 2233 const __m128i hi_17_30 = 
_mm_unpackhi_epi16(stp1_16, stp1_31); \ 2234 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ 2235 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ 2236 \ 2237 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ 2238 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ 2239 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ 2240 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ 2241 \ 2242 MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ 2243 stp1_7); \ 2244 \ 2245 stp1_8 = stp2_8; \ 2246 stp1_11 = stp2_11; \ 2247 stp1_12 = stp2_12; \ 2248 stp1_15 = stp2_15; \ 2249 \ 2250 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ 2251 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ 2252 stp1_29) \ 2253 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ 2254 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ 2255 stp1_25) \ 2256 \ 2257 stp1_16 = stp2_16; \ 2258 stp1_31 = stp2_31; \ 2259 stp1_19 = stp2_19; \ 2260 stp1_20 = stp2_20; \ 2261 stp1_23 = stp2_23; \ 2262 stp1_24 = stp2_24; \ 2263 stp1_27 = stp2_27; \ 2264 stp1_28 = stp2_28; \ 2265 } \ 2266 \ 2267 /* Stage4 */ \ 2268 { \ 2269 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ 2270 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ 2271 \ 2272 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ 2273 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ 2274 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ 2275 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ 2276 \ 2277 MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ 2278 stp2_1); \ 2279 \ 2280 stp2_4 = stp1_4; \ 2281 stp2_5 = stp1_4; \ 2282 stp2_6 = stp1_7; \ 2283 stp2_7 = stp1_7; \ 2284 \ 2285 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ 2286 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ 
2287 stp2_13) \ 2288 \ 2289 stp2_8 = stp1_8; \ 2290 stp2_15 = stp1_15; \ 2291 stp2_11 = stp1_11; \ 2292 stp2_12 = stp1_12; \ 2293 \ 2294 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ 2295 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ 2296 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ 2297 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ 2298 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ 2299 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ 2300 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ 2301 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ 2302 \ 2303 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ 2304 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ 2305 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ 2306 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ 2307 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ 2308 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ 2309 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ 2310 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ 2311 } \ 2312 \ 2313 /* Stage5 */ \ 2314 { \ 2315 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 2316 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 2317 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 2318 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 2319 \ 2320 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ 2321 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ 2322 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 2323 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 2324 \ 2325 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 2326 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 2327 \ 2328 stp1_0 = stp2_0; \ 2329 stp1_1 = stp2_1; \ 2330 stp1_2 = stp2_1; \ 2331 stp1_3 = stp2_0; \ 2332 \ 2333 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 2334 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 2335 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 2336 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 2337 \ 2338 
tmp0 = _mm_add_epi32(tmp0, rounding); \ 2339 tmp1 = _mm_add_epi32(tmp1, rounding); \ 2340 tmp2 = _mm_add_epi32(tmp2, rounding); \ 2341 tmp3 = _mm_add_epi32(tmp3, rounding); \ 2342 \ 2343 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 2344 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 2345 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 2346 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 2347 \ 2348 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 2349 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 2350 \ 2351 stp1_4 = stp2_4; \ 2352 stp1_7 = stp2_7; \ 2353 \ 2354 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ 2355 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 2356 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 2357 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ 2358 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ 2359 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 2360 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 2361 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ 2362 \ 2363 stp1_16 = stp2_16; \ 2364 stp1_17 = stp2_17; \ 2365 \ 2366 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ 2367 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ 2368 stp1_28) \ 2369 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ 2370 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ 2371 stp1_26) \ 2372 \ 2373 stp1_22 = stp2_22; \ 2374 stp1_23 = stp2_23; \ 2375 stp1_24 = stp2_24; \ 2376 stp1_25 = stp2_25; \ 2377 stp1_30 = stp2_30; \ 2378 stp1_31 = stp2_31; \ 2379 } \ 2380 \ 2381 /* Stage6 */ \ 2382 { \ 2383 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 2384 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 2385 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 2386 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 2387 \ 2388 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ 2389 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 2390 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 2391 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ 
2392 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ 2393 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 2394 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 2395 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ 2396 \ 2397 stp2_8 = stp1_8; \ 2398 stp2_9 = stp1_9; \ 2399 stp2_14 = stp1_14; \ 2400 stp2_15 = stp1_15; \ 2401 \ 2402 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ 2403 stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ 2404 stp2_12) \ 2405 \ 2406 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ 2407 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ 2408 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ 2409 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ 2410 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ 2411 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ 2412 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ 2413 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ 2414 \ 2415 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ 2416 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ 2417 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ 2418 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ 2419 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ 2420 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ 2421 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ 2422 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ 2423 } \ 2424 \ 2425 /* Stage7 */ \ 2426 { \ 2427 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 2428 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 2429 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 2430 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 2431 \ 2432 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 2433 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 2434 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ 2435 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ 2436 \ 2437 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ 2438 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ 2439 stp1_2 = 
_mm_add_epi16(stp2_2, stp2_13);                             \
    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                \
    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                \
    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                \
    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                 \
    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                 \
    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                 \
    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                 \
    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);               \
    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);               \
    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);               \
    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);               \
    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);               \
    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);               \
                                                            \
    stp1_16 = stp2_16;                                      \
    stp1_17 = stp2_17;                                      \
    stp1_18 = stp2_18;                                      \
    stp1_19 = stp2_19;                                      \
                                                            \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,   \
                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
                           stp1_26)                         \
    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,   \
                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
                           stp1_24)                         \
                                                            \
    stp1_28 = stp2_28;                                      \
    stp1_29 = stp2_29;                                      \
    stp1_30 = stp2_30;                                      \
    stp1_31 = stp2_31;                                      \
  }

// IDCT32: full 32-point inverse DCT butterfly network over 8 columns at a
// time (each __m128i holds 8 int16 coefficients).  Reads in[0..31], and
// expects rounding, the stg* cosine-pair constants, the tmp* scratch
// registers and the stp1_*/stp2_* state variables to be in scope at the
// expansion site.  Produces the final stage outputs in stp1_0..stp1_31;
// the caller forms outputs 0..31 as stp1_k +/- stp1_(31-k).
#define IDCT32 \
  /* Stage1 */ \
  { \
    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
    const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
    const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
    \
    const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
    const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
    const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
    const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
    \
    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
    const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
    const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
    \
    const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
    const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
    const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
    const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
    \
    /* Odd-index inputs feed the rotations producing outputs 16..31. */ \
    MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
                           stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
                           stp1_30) \
    MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
                           stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
    MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
                           stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
                           stp1_21, stp1_26) \
    MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
                           stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
                           stp1_23, stp1_24) \
  } \
  \
  /* Stage2 */ \
  { \
    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
    const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
    const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
    \
    const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
    const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
    const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
    const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
                           stp2_14) \
    MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
                           stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
                           stp2_12) \
    \
    /* Butterflies on the Stage1 outputs (16..31). */ \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
    stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
    stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
    stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
    \
    stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
    stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
    stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
    stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
    \
    stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
    stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
    stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
    stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
    \
    stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
    stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
    stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
    const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
    const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
    \
    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
    \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
    \
    MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
                           stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
                           stp1_6) \
    \
    stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
    \
    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
                           stp1_29) \
    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
                           stp1_25) \
    \
    /* The remaining 16..31 terms pass straight through this stage. */ \
    stp1_16 = stp2_16; \
    stp1_31 = stp2_31; \
    stp1_19 = stp2_19; \
    stp1_20 = stp2_20; \
    stp1_23 = stp2_23; \
    stp1_24 = stp2_24; \
    stp1_27 = stp2_27; \
    stp1_28 = stp2_28; \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
    const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
    const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
                           stp2_13) \
    \
    stp2_8 = stp1_8; \
    stp2_15 = stp1_15; \
    stp2_11 = stp1_11; \
    stp2_12 = stp1_12; \
    \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
    stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
    stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
    \
    stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
    stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
    stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
    \
    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
    \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    /* Inline rotation of (6,5) by cospi_16: madd, round, shift, repack. */ \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_4 = stp2_4; \
    stp1_7 = stp2_7; \
    \
    stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
    \
    stp1_16 = stp2_16; \
    stp1_17 = stp2_17; \
    \
    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
                           stp1_28) \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
                           stp1_26) \
    \
    stp1_22 = stp2_22; \
    stp1_23 = stp2_23; \
    stp1_24 = stp2_24; \
    stp1_25 = stp2_25; \
    stp1_30 = stp2_30; \
    stp1_31 = stp2_31; \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
    \
    stp2_8 = stp1_8; \
    stp2_9 = stp1_9; \
    stp2_14 = stp1_14; \
    stp2_15 = stp1_15; \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
                           stp2_12) \
    \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
    stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
    stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
    \
    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
    stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
    stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
    stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
  } \
  \
  /* Stage7 */ \
  { \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
    stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
    stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
    stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
    stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
    stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
    stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
    \
    stp1_16 = stp2_16; \
    stp1_17 = stp2_17; \
    stp1_18 = stp2_18; \
    stp1_19 = stp2_19; \
    \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
                           stp1_26) \
    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
                           stp1_24) \
    \
    stp1_28 = stp2_28; \
    stp1_29 = stp2_29; \
    stp1_30 = stp2_30; \
    stp1_31 = stp2_31; \
  }

// 32x32 inverse DCT, fast path used when only the upper-left 8x8 block of
// coefficients is non-zero (the "34" eob class).  Computes the 2-D inverse
// transform, rounds, and adds the residual to the 8-bit predictor in dest.
// Only upper-left 8x8 has non-zero coeff
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Final pass adds 1 << 5 before the >> 6 shift (round-to-nearest).
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage (cosine pairs; only the subset needed
  // when rows 8..31 of the input are zero is declared here).
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[32];
  // stp1_*/stp2_* are the two ping-pong register banks used by the IDCT32_34
  // macro; tmp* are its 32-bit scratch accumulators.
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data. Only need to load the top left 8x8 block
  // (rows are 32 coefficients apart in the 32x32 coefficient plane).
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 32);
  in[2] = load_input_data(input + 64);
  in[3] = load_input_data(input + 96);
  in[4] = load_input_data(input + 128);
  in[5] = load_input_data(input + 160);
  in[6] = load_input_data(input + 192);
  in[7] = load_input_data(input + 224);

  // Row pass over the 8 non-zero rows.
  array_transpose_8x8(in, in);
  IDCT32_34

  // 1_D: Store 32 intermediate results for each 8x32 block.
  // Output k of the transform is stp1_k +/- stp1_(31-k).
  col[0] = _mm_add_epi16(stp1_0, stp1_31);
  col[1] = _mm_add_epi16(stp1_1, stp1_30);
  col[2] = _mm_add_epi16(stp1_2, stp1_29);
  col[3] = _mm_add_epi16(stp1_3, stp1_28);
  col[4] = _mm_add_epi16(stp1_4, stp1_27);
  col[5] = _mm_add_epi16(stp1_5, stp1_26);
  col[6] = _mm_add_epi16(stp1_6, stp1_25);
  col[7] = _mm_add_epi16(stp1_7, stp1_24);
  col[8] = _mm_add_epi16(stp1_8, stp1_23);
  col[9] = _mm_add_epi16(stp1_9, stp1_22);
  col[10] = _mm_add_epi16(stp1_10, stp1_21);
  col[11] = _mm_add_epi16(stp1_11, stp1_20);
  col[12] = _mm_add_epi16(stp1_12, stp1_19);
  col[13] = _mm_add_epi16(stp1_13, stp1_18);
  col[14] = _mm_add_epi16(stp1_14, stp1_17);
  col[15] = _mm_add_epi16(stp1_15, stp1_16);
  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
  // Column pass: process 4 strips of 8 columns each.
  for (i = 0; i < 4; i++) {
    int j;
    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + i * 8, in);
    IDCT32_34

    // 2_D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

    for (j = 0; j < 32; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;  // advance to the next 8-column strip
  }
}

void vpx_idct32x32_1024_add_sse2(const
    tran_low_t *input, uint8_t *dest,
                                 int stride) {
  // Full 32x32 inverse DCT (all 1024 coefficients may be non-zero).
  // Runs a row pass over four 32x8 strips (with a per-strip all-zero skip),
  // then a column pass, and adds the residual to the predictor in dest.
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Final pass adds 1 << 5 before the >> 6 shift (round-to-nearest).
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  // idct constants for each stage (full set of cosine pairs).
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  // col holds all four row-pass strips (4 * 32 registers); zero_idx is an
  // OR-reduction tree used to detect an all-zero input strip.
  __m128i in[32], col[128], zero_idx[16];
  // stp1_*/stp2_* are the two ping-pong register banks used by the IDCT32
  // macro; tmp* are its 32-bit scratch accumulators.
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j, i32;

  for (i = 0; i < 4; i++) {
    i32 = (i << 5);  // base index of this strip's 32 entries in col[]
    // First 1-D idct
    // Load input data.  LOAD_DQCOEFF advances `input`; the interleaved
    // destination order (0,8,16,24, 1,9,17,25, ...) places each group of
    // 8 consecutive coefficient rows into the four 8-register quadrants.
    LOAD_DQCOEFF(in[0], input);
    LOAD_DQCOEFF(in[8], input);
    LOAD_DQCOEFF(in[16], input);
    LOAD_DQCOEFF(in[24], input);
    LOAD_DQCOEFF(in[1], input);
    LOAD_DQCOEFF(in[9], input);
    LOAD_DQCOEFF(in[17], input);
    LOAD_DQCOEFF(in[25], input);
    LOAD_DQCOEFF(in[2], input);
    LOAD_DQCOEFF(in[10], input);
    LOAD_DQCOEFF(in[18], input);
    LOAD_DQCOEFF(in[26], input);
    LOAD_DQCOEFF(in[3], input);
    LOAD_DQCOEFF(in[11], input);
    LOAD_DQCOEFF(in[19], input);
    LOAD_DQCOEFF(in[27], input);

    LOAD_DQCOEFF(in[4], input);
    LOAD_DQCOEFF(in[12], input);
    LOAD_DQCOEFF(in[20], input);
    LOAD_DQCOEFF(in[28], input);
    LOAD_DQCOEFF(in[5], input);
    LOAD_DQCOEFF(in[13], input);
    LOAD_DQCOEFF(in[21], input);
    LOAD_DQCOEFF(in[29], input);
    LOAD_DQCOEFF(in[6], input);
    LOAD_DQCOEFF(in[14], input);
    LOAD_DQCOEFF(in[22], input);
    LOAD_DQCOEFF(in[30], input);
    LOAD_DQCOEFF(in[7], input);
    LOAD_DQCOEFF(in[15], input);
    LOAD_DQCOEFF(in[23], input);
    LOAD_DQCOEFF(in[31], input);

    // checking if all entries are zero (OR-reduce the 32 registers).
    zero_idx[0] = _mm_or_si128(in[0], in[1]);
    zero_idx[1] = _mm_or_si128(in[2], in[3]);
    zero_idx[2] = _mm_or_si128(in[4], in[5]);
    zero_idx[3] = _mm_or_si128(in[6], in[7]);
    zero_idx[4] = _mm_or_si128(in[8], in[9]);
    zero_idx[5] = _mm_or_si128(in[10], in[11]);
    zero_idx[6] = _mm_or_si128(in[12], in[13]);
    zero_idx[7] = _mm_or_si128(in[14], in[15]);
    zero_idx[8] = _mm_or_si128(in[16], in[17]);
    zero_idx[9] = _mm_or_si128(in[18], in[19]);
    zero_idx[10] = _mm_or_si128(in[20], in[21]);
    zero_idx[11] = _mm_or_si128(in[22], in[23]);
    zero_idx[12] = _mm_or_si128(in[24], in[25]);
    zero_idx[13] = _mm_or_si128(in[26], in[27]);
    zero_idx[14] = _mm_or_si128(in[28], in[29]);
    zero_idx[15] = _mm_or_si128(in[30], in[31]);

    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);

    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);

    // An all-zero strip transforms to zero: skip the IDCT for this strip.
    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
      col[i32 + 0] = _mm_setzero_si128();
      col[i32 + 1] = _mm_setzero_si128();
      col[i32 + 2] = _mm_setzero_si128();
      col[i32 + 3] = _mm_setzero_si128();
      col[i32 + 4] = _mm_setzero_si128();
      col[i32 + 5] = _mm_setzero_si128();
      col[i32 + 6] = _mm_setzero_si128();
      col[i32 + 7] = _mm_setzero_si128();
      col[i32 + 8] = _mm_setzero_si128();
      col[i32 + 9] = _mm_setzero_si128();
      col[i32 + 10] = _mm_setzero_si128();
      col[i32 + 11] = _mm_setzero_si128();
      col[i32 + 12] = _mm_setzero_si128();
      col[i32 + 13] = _mm_setzero_si128();
      col[i32 + 14] = _mm_setzero_si128();
      col[i32 + 15] = _mm_setzero_si128();
      col[i32 + 16] = _mm_setzero_si128();
      col[i32 + 17] = _mm_setzero_si128();
      col[i32 + 18] = _mm_setzero_si128();
      col[i32 + 19] = _mm_setzero_si128();
      col[i32 + 20] = _mm_setzero_si128();
      col[i32 + 21] = _mm_setzero_si128();
      col[i32 + 22] = _mm_setzero_si128();
      col[i32 + 23] = _mm_setzero_si128();
      col[i32 + 24] = _mm_setzero_si128();
      col[i32 + 25] = _mm_setzero_si128();
      col[i32 + 26] = _mm_setzero_si128();
      col[i32 + 27] = _mm_setzero_si128();
      col[i32 + 28] = _mm_setzero_si128();
      col[i32 + 29] = _mm_setzero_si128();
      col[i32 + 30] = _mm_setzero_si128();
      col[i32 + 31] = _mm_setzero_si128();
      continue;
    }

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);
    array_transpose_8x8(in + 16, in + 16);
    array_transpose_8x8(in + 24, in + 24);

    IDCT32

    // 1_D: Store 32 intermediate results for each 8x32 block.
    // Output k of the transform is stp1_k +/- stp1_(31-k).
    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
  }
  for (i = 0; i < 4; i++) {
    // Second 1-D idct
    j = i << 3;  // starting column of this 8-column strip

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + j, in);
    array_transpose_8x8(col + j + 32, in + 8);
    array_transpose_8x8(col + j + 64, in + 16);
    array_transpose_8x8(col + j + 96, in + 24);

    IDCT32

    // 2_D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

    for (j = 0; j < 32; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;  // advance to the next 8-column strip
  }
}

// 32x32 inverse DCT when only the DC coefficient is non-zero: the residual
// is a single constant, computed with two cospi_16_64 rotations plus the
// final >> 6 rounding, then added to every pixel of the 32x32 block.
void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, j;

  // Equivalent of the row pass, column pass, and final shift applied to a
  // DC-only block.
  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  // Add the constant to all 32x32 pixels, 8 at a time (4 stores per row).
  for (j = 0; j < 32; ++j) {
    RECON_AND_STORE(dest + 0 + j * stride, dc_value);
    RECON_AND_STORE(dest + 8 + j * stride, dc_value);
    RECON_AND_STORE(dest + 16 + j * stride, dc_value);
    RECON_AND_STORE(dest + 24 + j * stride, dc_value);
  }
}