/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vp9/common/x86/vp9_idct_intrin_sse2.h"

#define RECON_AND_STORE4X4(dest, in_x) \
{ \
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  *(int *)dest = _mm_cvtsi128_si32(d0); \
  dest += stride; \
}

void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16(
      (int16_t)cospi_16_64, (int16_t)cospi_16_64,
      (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
      (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch columns 2 and 3, so that the butterfly below produces:
  // input2: column 1, column 0;  input3: column 2, column 3.
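  // _mm_shuffle_epi32 with 0x4e swaps the two 64-bit halves of input1.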
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch columns 2 and 3, so that the butterfly below produces:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
                                *(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store input0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store input1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store input2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store input3
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
}

static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}

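// 1-D 4-point IDCT used by the hybrid 4x4 transform below. Each of in[0]
// and in[1] packs two rows of four coefficients, so one call transforms the
// whole block in one dimension.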
static void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

static void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}

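// 2-D 4x4 hybrid inverse transform: tx_type selects DCT or ADST for each
// dimension (0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST),
// followed by rounding, a shift by 4, and reconstruction.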
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[2];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadu_si128((const __m128i *)(input));
  in[1] = _mm_loadu_si128((const __m128i *)(input + 8));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_sse2(in);
      idct4_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_sse2(in);
      iadst4_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_sse2(in);
      idct4_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_sse2(in);
      iadst4_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
                                    *(const int *)(dest + stride * 3)));
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, in[0]);
    d2 = _mm_add_epi16(d2, in[1]);
    d0 = _mm_packus_epi16(d0, d2);
    // store result[0]
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store result[1]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store result[2]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
    // store result[3]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
  }
}

#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

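// Transpose variants for the short-cut transforms below, where only the
// first rows of the input hold nonzero coefficients.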
#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
                         out0, out1, out2, out3) \
  { \
    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  }

#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  }

// Define Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    tmp4 = _mm_madd_epi16(lo_1, cst2); \
    tmp5 = _mm_madd_epi16(hi_1, cst2); \
    tmp6 = _mm_madd_epi16(lo_1, cst3); \
    tmp7 = _mm_madd_epi16(hi_1, cst3); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp4 = _mm_add_epi32(tmp4, rounding); \
    tmp5 = _mm_add_epi32(tmp5, rounding); \
    tmp6 = _mm_add_epi32(tmp6, rounding); \
    tmp7 = _mm_add_epi32(tmp7, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
  }

#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    /* Stage1 */ \
    { \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
      \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                             stg1_1, stg1_2, stg1_3, stp1_4, \
                             stp1_7, stp1_5, stp1_6) \
    } \
    \
    /* Stage2 */ \
    { \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
      \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                             stg2_1, stg2_2, stg2_3, stp2_0, \
                             stp2_1, stp2_2, stp2_3) \
      \
      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
    } \
    \
    /* Stage3 */ \
    { \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
      \
      tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
      tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
      tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
      tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    } \
    \
    /* Stage4 */ \
    out0 = _mm_adds_epi16(stp1_0, stp2_7); \
    out1 = _mm_adds_epi16(stp1_1, stp1_6); \
    out2 = _mm_adds_epi16(stp1_2, stp1_5); \
    out3 = _mm_adds_epi16(stp1_3, stp2_4); \
    out4 = _mm_subs_epi16(stp1_3, stp2_4); \
    out5 = _mm_subs_epi16(stp1_2, stp1_5); \
    out6 = _mm_subs_epi16(stp1_1, stp1_6); \
    out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }

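// Full 8x8 inverse DCT: two passes (rows, then columns), each an 8x8
// transpose followed by the 4-stage 1-D IDCT8, then a final rounding shift
// by 5 and reconstruction into dest.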
void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
                  in0, in1, in2, in3, in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
          in0, in1, in2, in3, in4, in5, in6, in7);
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
}

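// 1-D 8-point IDCT over the eight rows held in in[]: transpose first, then
// run the shared 4-stage IDCT8 butterfly. vp9_iht8x8_64_add_sse2 below calls
// this (and iadst8_sse2) once per dimension of the hybrid transform.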
static void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                in0, in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}

static void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // rounding and shift
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_sse2(in);
      idct8_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_sse2(in);
      iadst8_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_sse2(in);
      idct8_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_sse2(in);
      iadst8_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
}

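// Short-cut 8x8 inverse DCT for nearly-empty blocks: only the top-left 4x4
// quadrant of coefficients is assumed nonzero, so the row pass works on
// half-width registers before the full-width column pass.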
void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
  // Stage1
  { //NOLINT
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  { //NOLINT
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  { //NOLINT
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);

    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);

  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
        in0, in1, in2, in3, in4, in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

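// Stages 2 through 6 of the 16-point IDCT, operating on eight columns at a
// time; the callers apply the final stage-7 butterflies themselves.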
#define IDCT16 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

#define IDCT16_10 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
                           stg2_0, stg2_1, stg2_6, stg2_7, \
                           stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
    \
    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
                             stg3_0, stg3_1, \
                             stp2_4, stp2_7) \
    \
    stp1_9 = stp1_8_0; \
    stp1_10 = stp1_11; \
    \
    stp1_13 = stp1_12_0; \
    stp1_14 = stp1_15; \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
                             stg4_0, stg4_1, \
                             stp1_0, stp1_1) \
    stp2_5 = stp2_4; \
    stp2_6 = stp2_7; \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_2 = stp1_1; \
    stp1_3 = stp1_0; \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

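// Full 16x16 inverse DCT. The first pass transforms the input in two
// 8-column halves, saving the intermediate rows in l[] and r[]; the second
// pass transposes those back, runs the column transform, and finishes with
// a rounding shift by 6 and reconstruction.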
void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
                                int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[16], l[16], r[16], *curr1;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  curr1 = l;
  for (i = 0; i < 2; i++) {
    // 1-D idct

    // Load input data.
    in[0] = _mm_load_si128((const __m128i *)input);
    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));

    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);

    IDCT16

    // Stage7
    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

    curr1 = r;
    input += 128;
  }
  for (i = 0; i < 2; i++) {
    // 1-D idct
    array_transpose_8x8(l + i * 8, in);
    array_transpose_8x8(r + i * 8, in + 8);

    IDCT16

    // 2-D
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    // Final rounding and shift
    in[0] = _mm_adds_epi16(in[0], final_rounding);
    in[1] = _mm_adds_epi16(in[1], final_rounding);
    in[2] = _mm_adds_epi16(in[2], final_rounding);
    in[3] = _mm_adds_epi16(in[3], final_rounding);
    in[4] = _mm_adds_epi16(in[4], final_rounding);
    in[5] = _mm_adds_epi16(in[5], final_rounding);
    in[6] = _mm_adds_epi16(in[6], final_rounding);
    in[7] = _mm_adds_epi16(in[7], final_rounding);
    in[8] = _mm_adds_epi16(in[8], final_rounding);
    in[9] = _mm_adds_epi16(in[9], final_rounding);
    in[10] = _mm_adds_epi16(in[10], final_rounding);
    in[11] = _mm_adds_epi16(in[11], final_rounding);
    in[12] = _mm_adds_epi16(in[12], final_rounding);
    in[13] = _mm_adds_epi16(in[13], final_rounding);
    in[14] = _mm_adds_epi16(in[14], final_rounding);
    in[15] = _mm_adds_epi16(in[15], final_rounding);

    in[0] = _mm_srai_epi16(in[0], 6);
    in[1] = _mm_srai_epi16(in[1], 6);
    in[2] = _mm_srai_epi16(in[2], 6);
    in[3] = _mm_srai_epi16(in[3], 6);
    in[4] = _mm_srai_epi16(in[4], 6);
    in[5] = _mm_srai_epi16(in[5], 6);
    in[6] = _mm_srai_epi16(in[6], 6);
    in[7] = _mm_srai_epi16(in[7], 6);
    in[8] = _mm_srai_epi16(in[8], 6);
    in[9] = _mm_srai_epi16(in[9], 6);
    in[10] = _mm_srai_epi16(in[10], 6);
    in[11] = _mm_srai_epi16(in[11], 6);
    in[12] = _mm_srai_epi16(in[12], 6);
    in[13] = _mm_srai_epi16(in[13], 6);
    in[14] = _mm_srai_epi16(in[14], 6);
    in[15] = _mm_srai_epi16(in[15], 6);

    RECON_AND_STORE(dest, in[0]);
    RECON_AND_STORE(dest, in[1]);
    RECON_AND_STORE(dest, in[2]);
    RECON_AND_STORE(dest, in[3]);
    RECON_AND_STORE(dest, in[4]);
    RECON_AND_STORE(dest, in[5]);
    RECON_AND_STORE(dest, in[6]);
    RECON_AND_STORE(dest, in[7]);
    RECON_AND_STORE(dest, in[8]);
    RECON_AND_STORE(dest, in[9]);
    RECON_AND_STORE(dest, in[10]);
    RECON_AND_STORE(dest, in[11]);
    RECON_AND_STORE(dest, in[12]);
    RECON_AND_STORE(dest, in[13]);
    RECON_AND_STORE(dest, in[14]);
    RECON_AND_STORE(dest, in[15]);

    dest += 8 - (stride * 16);
  }
}

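// DC-only 16x16 path: a lone DC coefficient inverse-transforms to a
// constant block, so compute that value once and add it to every pixel,
// eight columns at a time.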
void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 2; ++i) {
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    dest += 8 - (stride * 16);
  }
}
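
// 16-point inverse ADST applied to 8 columns at a time. Throughout,
// pairs of rows are interleaved with _mm_unpack{lo,hi}_epi16 so that
// each _mm_madd_epi16 evaluates a two-term rotation; the 32-bit results
// are rounded, shifted by DCT_CONST_BITS and packed back to 16 bits.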
static void iadst16_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  // stage 1
  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
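  // (the second half, s[8..15], is rotated by the cospi 4/28 and 20/12
  //  pairs below, while the first half folds into x[0..7] with plain
  //  16-bit adds and subtracts)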
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}
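
// 16-point inverse DCT on 8 columns: seven butterfly stages, with each
// rotation done as an unpack / _mm_madd_epi16 / round / shift / pack
// sequence on 16-bit lanes.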
static void idct16_8col(__m128i *in) {
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i v[16], u[16], s[16], t[16];

  // stage 1
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2
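  // (rotations by the odd angles 2/30, 18/14, 10/22 and 26/6 form the
  //  first butterfly of the odd half, s[8..15])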
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8] = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9] = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9] = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}

static void idct16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_8col(in0);
  idct16_8col(in1);
}

static void iadst16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_8col(in0);
  iadst16_8col(in1);
}
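
// Hybrid 16x16 transform: each *_sse2 helper above transposes first, so
// the first call applies the row transform and the second the column
// transform; tx_type selects DCT or ADST for each direction.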
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in0[16], in1[16];

  load_buffer_8x16(input, in0);
  input += 8;
  load_buffer_8x16(input, in1);

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      idct16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      iadst16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      iadst16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }

  write_buffer_8x16(dest, in0, stride);
  dest += 8;
  write_buffer_8x16(dest, in1, stride);
}
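
// 16x16 idct for the case where the nonzero coefficients are confined to
// the top-left 4x4 of the dequantized block (at most 10 of them). Only
// four partial rows are loaded, and butterfly terms that would multiply
// known-zero inputs are dropped.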
void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i in[16], l[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // First 1-D inverse DCT
  // Load input data.
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));

  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);

  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);

    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);

    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);

    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
    stp2_10 = _mm_packs_epi32(tmp5, tmp7);

    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
  }

  // Stage5 and Stage6
  {
    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);

    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);

    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
  }

  // Stage6
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_6 = _mm_packs_epi32(tmp3, tmp1);

    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp2, zero);
    stp2_11 = _mm_packs_epi32(tmp4, zero);
    stp2_12 = _mm_packs_epi32(tmp6, zero);

    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);

    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage7. Left 8x16 only.
  l[0] = _mm_add_epi16(stp2_0, stp1_15);
  l[1] = _mm_add_epi16(stp2_1, stp1_14);
  l[2] = _mm_add_epi16(stp2_2, stp2_13);
  l[3] = _mm_add_epi16(stp2_3, stp2_12);
  l[4] = _mm_add_epi16(stp2_4, stp2_11);
  l[5] = _mm_add_epi16(stp2_5, stp2_10);
  l[6] = _mm_add_epi16(stp2_6, stp1_9);
  l[7] = _mm_add_epi16(stp2_7, stp1_8);
  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
  l[15] = _mm_sub_epi16(stp2_0, stp1_15);

  // Second 1-D inverse transform, performed per 8x16 block
  for (i = 0; i < 2; i++) {
    array_transpose_4X8(l + 8 * i, in);

    IDCT16_10

    // Stage7
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    // Final rounding and shift
    in[0] = _mm_adds_epi16(in[0], final_rounding);
    in[1] = _mm_adds_epi16(in[1], final_rounding);
    in[2] = _mm_adds_epi16(in[2], final_rounding);
    in[3] = _mm_adds_epi16(in[3], final_rounding);
    in[4] = _mm_adds_epi16(in[4], final_rounding);
    in[5] = _mm_adds_epi16(in[5], final_rounding);
    in[6] = _mm_adds_epi16(in[6], final_rounding);
    in[7] = _mm_adds_epi16(in[7], final_rounding);
    in[8] = _mm_adds_epi16(in[8], final_rounding);
    in[9] = _mm_adds_epi16(in[9], final_rounding);
    in[10] = _mm_adds_epi16(in[10], final_rounding);
    in[11] = _mm_adds_epi16(in[11], final_rounding);
    in[12] = _mm_adds_epi16(in[12], final_rounding);
    in[13] = _mm_adds_epi16(in[13], final_rounding);
    in[14] = _mm_adds_epi16(in[14], final_rounding);
    in[15] = _mm_adds_epi16(in[15], final_rounding);

    in[0] = _mm_srai_epi16(in[0], 6);
    in[1] = _mm_srai_epi16(in[1], 6);
    in[2] = _mm_srai_epi16(in[2], 6);
    in[3] = _mm_srai_epi16(in[3], 6);
    in[4] = _mm_srai_epi16(in[4], 6);
    in[5] = _mm_srai_epi16(in[5], 6);
    in[6] = _mm_srai_epi16(in[6], 6);
    in[7] = _mm_srai_epi16(in[7], 6);
    in[8] = _mm_srai_epi16(in[8], 6);
    in[9] = _mm_srai_epi16(in[9], 6);
    in[10] = _mm_srai_epi16(in[10], 6);
    in[11] = _mm_srai_epi16(in[11], 6);
    in[12] = _mm_srai_epi16(in[12], 6);
    in[13] = _mm_srai_epi16(in[13], 6);
    in[14] = _mm_srai_epi16(in[14], 6);
    in[15] = _mm_srai_epi16(in[15], 6);

    RECON_AND_STORE(dest, in[0]);
    RECON_AND_STORE(dest, in[1]);
    RECON_AND_STORE(dest, in[2]);
    RECON_AND_STORE(dest, in[3]);
    RECON_AND_STORE(dest, in[4]);
    RECON_AND_STORE(dest, in[5]);
    RECON_AND_STORE(dest, in[6]);
    RECON_AND_STORE(dest, in[7]);
    RECON_AND_STORE(dest, in[8]);
    RECON_AND_STORE(dest, in[9]);
    RECON_AND_STORE(dest, in[10]);
    RECON_AND_STORE(dest, in[11]);
    RECON_AND_STORE(dest, in[12]);
    RECON_AND_STORE(dest, in[13]);
    RECON_AND_STORE(dest, in[14]);
    RECON_AND_STORE(dest, in[15]);

    dest += 8 - (stride * 16);
  }
}

#define LOAD_DQCOEFF(reg, input) \
  { \
    reg = _mm_load_si128((const __m128i *) input); \
    input += 8; \
  }
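
// Reduced 32x32 idct stages used when only the first 8x8 block of
// coefficients can be nonzero (the 34-coefficient case): inputs
// in[8..31] are known to be zero, so stage 1 and 2 rotations unpack
// against zero and their pass-through butterflies collapse into copies.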

#define IDCT32_34 \
/* Stage1 */ \
{ \
  const __m128i zero = _mm_setzero_si128(); \
  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
  \
  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
  \
  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
  \
  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
  \
  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
                           stg1_1, stp1_16, stp1_31); \
  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
                           stg1_7, stp1_19, stp1_28); \
  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
                           stg1_9, stp1_20, stp1_27); \
  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
                           stg1_15, stp1_23, stp1_24); \
} \
\
/* Stage2 */ \
{ \
  const __m128i zero = _mm_setzero_si128(); \
  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
  \
  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
  \
  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
                           stg2_1, stp2_8, stp2_15); \
  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
                           stg2_7, stp2_11, stp2_12); \
  \
  stp2_16 = stp1_16; \
  stp2_19 = stp1_19; \
  \
  stp2_20 = stp1_20; \
  stp2_23 = stp1_23; \
  \
  stp2_24 = stp1_24; \
  stp2_27 = stp1_27; \
  \
  stp2_28 = stp1_28; \
  stp2_31 = stp1_31; \
} \
\
/* Stage3 */ \
{ \
  const __m128i zero = _mm_setzero_si128(); \
  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
  \
  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
  \
  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
                           stg3_1, stp1_4, stp1_7); \
  \
  stp1_8 = stp2_8; \
  stp1_11 = stp2_11; \
  stp1_12 = stp2_12; \
  stp1_15 = stp2_15; \
  \
  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
                         stp1_18, stp1_29) \
  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
                         stp1_22, stp1_25) \
  \
  stp1_16 = stp2_16; \
  stp1_31 = stp2_31; \
  stp1_19 = stp2_19; \
  stp1_20 = stp2_20; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_27 = stp2_27; \
  stp1_28 = stp2_28; \
} \
\
/* Stage4 */ \
{ \
  const __m128i zero = _mm_setzero_si128(); \
  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
  \
  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
  \
  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
                           stg4_1, stp2_0, stp2_1); \
  \
  stp2_4 = stp1_4; \
  stp2_5 = stp1_4; \
  stp2_6 = stp1_7; \
  stp2_7 = stp1_7; \
  \
  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
                         stp2_10, stp2_13) \
  \
  stp2_8 = stp1_8; \
  stp2_15 = stp1_15; \
  stp2_11 = stp1_11; \
  stp2_12 = stp1_12; \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
  \
  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
} \
\
/* Stage5 */ \
{ \
  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  \
  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  stp1_0 = stp2_0; \
  stp1_1 = stp2_1; \
  stp1_2 = stp2_1; \
  stp1_3 = stp2_0; \
  \
  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
  \
  tmp0 = _mm_add_epi32(tmp0, rounding); \
  tmp1 = _mm_add_epi32(tmp1, rounding); \
  tmp2 = _mm_add_epi32(tmp2, rounding); \
  tmp3 = _mm_add_epi32(tmp3, rounding); \
  \
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  \
  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  \
  stp1_4 = stp2_4; \
  stp1_7 = stp2_7; \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  \
  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
                         stp1_19, stp1_28) \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  \
  stp1_22 = stp2_22; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_25 = stp2_25; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
} \
\
/* Stage6 */ \
{ \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
  \
  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
  \
  stp2_8 = stp1_8; \
  stp2_9 = stp1_9; \
  stp2_14 = stp1_14; \
  stp2_15 = stp1_15; \
  \
  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
                         stp2_13, stp2_11, stp2_12) \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
  \
  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
} \
\
/* Stage7 */ \
{ \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  stp1_18 = stp2_18; \
  stp1_19 = stp2_19; \
  \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
                         stp1_23, stp1_24) \
  \
  stp1_28 = stp2_28; \
  stp1_29 = stp2_29; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
}
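
// Full 32-point version of the stage macros above: all 32 input rows can
// be nonzero, so every rotation consumes two real inputs instead of an
// input paired with zero.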


#define IDCT32 \
/* Stage1 */ \
{ \
  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
  \
  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
  const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
  \
  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
  \
  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
  \
  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
                         stp1_17, stp1_30) \
  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
                         stp1_19, stp1_28) \
  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
                         stp1_23, stp1_24) \
} \
\
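/* Stage 1 is the first odd-half butterfly; e.g. the stg1_0/stg1_1   */ \
/* pair computes stp1_16 = in[1] * cospi_31_64 - in[31] * cospi_1_64 */ \
/* and stp1_31 = in[1] * cospi_1_64 + in[31] * cospi_31_64.          */ \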
/* Stage2 */ \
{ \
  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
  \
  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
  \
  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
                         stp2_14) \
  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
                         stp2_11, stp2_12) \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
  \
  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
  \
  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
  \
  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
} \
\
/* Stage3 */ \
{ \
  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
  \
  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  \
  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
                         stp1_6) \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  \
  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
                         stp1_18, stp1_29) \
  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
                         stp1_22, stp1_25) \
  \
  stp1_16 = stp2_16; \
  stp1_31 = stp2_31; \
  stp1_19 = stp2_19; \
  stp1_20 = stp2_20; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_27 = stp2_27; \
  stp1_28 = stp2_28; \
} \
\
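/* Stage 4 rotates the remaining even-half inputs: in[0]/in[16] with */ \
/* the +/-cospi_16_64 pair and in[8]/in[24] with cospi_24/cospi_8,   */ \
/* while the odd half continues through add/sub butterflies.         */ \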
/* Stage4 */ \
{ \
  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
  \
  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  \
  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
                         stp2_2, stp2_3) \
  \
  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
  \
  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
                         stp2_10, stp2_13) \
  \
  stp2_8 = stp1_8; \
  stp2_15 = stp1_15; \
  stp2_11 = stp1_11; \
  stp2_12 = stp1_12; \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
  \
  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
} \
\
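/* Stage 5 rotates a single pair (stp2_6, stp2_5), so the rounding   */ \
/* sequence is written out inline; it is the same computation        */ \
/* MULTIPLICATION_AND_ADD_2 with stg4_1/stg4_0 would perform.        */ \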
/* Stage5 */ \
{ \
  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  \
  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
  \
  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
  \
  tmp0 = _mm_add_epi32(tmp0, rounding); \
  tmp1 = _mm_add_epi32(tmp1, rounding); \
  tmp2 = _mm_add_epi32(tmp2, rounding); \
  tmp3 = _mm_add_epi32(tmp3, rounding); \
  \
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  \
  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  \
  stp1_4 = stp2_4; \
  stp1_7 = stp2_7; \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  \
  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
                         stp1_19, stp1_28) \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  \
  stp1_22 = stp2_22; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_25 = stp2_25; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
} \
\
/* Stage6 */ \
{ \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
  \
  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
  \
  stp2_8 = stp1_8; \
  stp2_9 = stp1_9; \
  stp2_14 = stp1_14; \
  stp2_15 = stp1_15; \
  \
  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
                         stp2_13, stp2_11, stp2_12) \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
  \
  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
} \
\
/* Stage7 */ \
{ \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  stp1_18 = stp2_18; \
  stp1_19 = stp2_19; \
  \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
                         stp1_23, stp1_24) \
  \
  stp1_28 = stp2_28; \
  stp1_29 = stp2_29; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
}
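
// Both entry points below are built from these macros: IDCT32 is the
// full 32-input 1-D transform, while IDCT32_34 (whose tail appears
// above) is the trimmed variant that assumes only in[0]..in[7] can be
// non-zero.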
// Only the upper-left 8x8 block has non-zero coefficients.
void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[32];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
      stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
      stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
      stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
      stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // Load input data.
  LOAD_DQCOEFF(in[0], input);
  LOAD_DQCOEFF(in[8], input);
  LOAD_DQCOEFF(in[16], input);
  LOAD_DQCOEFF(in[24], input);
  LOAD_DQCOEFF(in[1], input);
  LOAD_DQCOEFF(in[9], input);
  LOAD_DQCOEFF(in[17], input);
  LOAD_DQCOEFF(in[25], input);
  LOAD_DQCOEFF(in[2], input);
  LOAD_DQCOEFF(in[10], input);
  LOAD_DQCOEFF(in[18], input);
  LOAD_DQCOEFF(in[26], input);
  LOAD_DQCOEFF(in[3], input);
  LOAD_DQCOEFF(in[11], input);
  LOAD_DQCOEFF(in[19], input);
  LOAD_DQCOEFF(in[27], input);

  LOAD_DQCOEFF(in[4], input);
  LOAD_DQCOEFF(in[12], input);
  LOAD_DQCOEFF(in[20], input);
  LOAD_DQCOEFF(in[28], input);
  LOAD_DQCOEFF(in[5], input);
  LOAD_DQCOEFF(in[13], input);
  LOAD_DQCOEFF(in[21], input);
  LOAD_DQCOEFF(in[29], input);
  LOAD_DQCOEFF(in[6], input);
  LOAD_DQCOEFF(in[14], input);
  LOAD_DQCOEFF(in[22], input);
  LOAD_DQCOEFF(in[30], input);
  LOAD_DQCOEFF(in[7], input);
  LOAD_DQCOEFF(in[15], input);
  LOAD_DQCOEFF(in[23], input);
  LOAD_DQCOEFF(in[31], input);

  array_transpose_8x8(in, in);
  array_transpose_8x8(in + 8, in + 8);
  array_transpose_8x8(in + 16, in + 16);
  array_transpose_8x8(in + 24, in + 24);

  IDCT32

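  // The adds/subs below are the final butterfly of the 32-point
  // transform: output k is stp1_k + stp1_(31-k) for k < 16 and the
  // mirrored difference for k >= 16.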
  // 1-D: Store 32 intermediate results for each 8x32 block.
  col[0] = _mm_add_epi16(stp1_0, stp1_31);
  col[1] = _mm_add_epi16(stp1_1, stp1_30);
  col[2] = _mm_add_epi16(stp1_2, stp1_29);
  col[3] = _mm_add_epi16(stp1_3, stp1_28);
  col[4] = _mm_add_epi16(stp1_4, stp1_27);
  col[5] = _mm_add_epi16(stp1_5, stp1_26);
  col[6] = _mm_add_epi16(stp1_6, stp1_25);
  col[7] = _mm_add_epi16(stp1_7, stp1_24);
  col[8] = _mm_add_epi16(stp1_8, stp1_23);
  col[9] = _mm_add_epi16(stp1_9, stp1_22);
  col[10] = _mm_add_epi16(stp1_10, stp1_21);
  col[11] = _mm_add_epi16(stp1_11, stp1_20);
  col[12] = _mm_add_epi16(stp1_12, stp1_19);
  col[13] = _mm_add_epi16(stp1_13, stp1_18);
  col[14] = _mm_add_epi16(stp1_14, stp1_17);
  col[15] = _mm_add_epi16(stp1_15, stp1_16);
  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
  for (i = 0; i < 4; i++) {
    const __m128i zero = _mm_setzero_si128();
    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + i * 8, in);
    IDCT32_34

    // 2-D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

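    // final_rounding is 1 << 5, so the saturating add plus the
    // arithmetic shift right by 6 below implement
    // ROUND_POWER_OF_TWO(x, 6) on each result before reconstruction.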
    // Final rounding and shift
    in[0] = _mm_adds_epi16(in[0], final_rounding);
    in[1] = _mm_adds_epi16(in[1], final_rounding);
    in[2] = _mm_adds_epi16(in[2], final_rounding);
    in[3] = _mm_adds_epi16(in[3], final_rounding);
    in[4] = _mm_adds_epi16(in[4], final_rounding);
    in[5] = _mm_adds_epi16(in[5], final_rounding);
    in[6] = _mm_adds_epi16(in[6], final_rounding);
    in[7] = _mm_adds_epi16(in[7], final_rounding);
    in[8] = _mm_adds_epi16(in[8], final_rounding);
    in[9] = _mm_adds_epi16(in[9], final_rounding);
    in[10] = _mm_adds_epi16(in[10], final_rounding);
    in[11] = _mm_adds_epi16(in[11], final_rounding);
    in[12] = _mm_adds_epi16(in[12], final_rounding);
    in[13] = _mm_adds_epi16(in[13], final_rounding);
    in[14] = _mm_adds_epi16(in[14], final_rounding);
    in[15] = _mm_adds_epi16(in[15], final_rounding);
    in[16] = _mm_adds_epi16(in[16], final_rounding);
    in[17] = _mm_adds_epi16(in[17], final_rounding);
    in[18] = _mm_adds_epi16(in[18], final_rounding);
    in[19] = _mm_adds_epi16(in[19], final_rounding);
    in[20] = _mm_adds_epi16(in[20], final_rounding);
    in[21] = _mm_adds_epi16(in[21], final_rounding);
    in[22] = _mm_adds_epi16(in[22], final_rounding);
    in[23] = _mm_adds_epi16(in[23], final_rounding);
    in[24] = _mm_adds_epi16(in[24], final_rounding);
    in[25] = _mm_adds_epi16(in[25], final_rounding);
    in[26] = _mm_adds_epi16(in[26], final_rounding);
    in[27] = _mm_adds_epi16(in[27], final_rounding);
    in[28] = _mm_adds_epi16(in[28], final_rounding);
    in[29] = _mm_adds_epi16(in[29], final_rounding);
    in[30] = _mm_adds_epi16(in[30], final_rounding);
    in[31] = _mm_adds_epi16(in[31], final_rounding);

    in[0] = _mm_srai_epi16(in[0], 6);
    in[1] = _mm_srai_epi16(in[1], 6);
    in[2] = _mm_srai_epi16(in[2], 6);
    in[3] = _mm_srai_epi16(in[3], 6);
    in[4] = _mm_srai_epi16(in[4], 6);
    in[5] = _mm_srai_epi16(in[5], 6);
    in[6] = _mm_srai_epi16(in[6], 6);
    in[7] = _mm_srai_epi16(in[7], 6);
    in[8] = _mm_srai_epi16(in[8], 6);
    in[9] = _mm_srai_epi16(in[9], 6);
    in[10] = _mm_srai_epi16(in[10], 6);
    in[11] = _mm_srai_epi16(in[11], 6);
    in[12] = _mm_srai_epi16(in[12], 6);
    in[13] = _mm_srai_epi16(in[13], 6);
    in[14] = _mm_srai_epi16(in[14], 6);
    in[15] = _mm_srai_epi16(in[15], 6);
    in[16] = _mm_srai_epi16(in[16], 6);
    in[17] = _mm_srai_epi16(in[17], 6);
    in[18] = _mm_srai_epi16(in[18], 6);
    in[19] = _mm_srai_epi16(in[19], 6);
    in[20] = _mm_srai_epi16(in[20], 6);
    in[21] = _mm_srai_epi16(in[21], 6);
    in[22] = _mm_srai_epi16(in[22], 6);
    in[23] = _mm_srai_epi16(in[23], 6);
    in[24] = _mm_srai_epi16(in[24], 6);
    in[25] = _mm_srai_epi16(in[25], 6);
    in[26] = _mm_srai_epi16(in[26], 6);
    in[27] = _mm_srai_epi16(in[27], 6);
    in[28] = _mm_srai_epi16(in[28], 6);
    in[29] = _mm_srai_epi16(in[29], 6);
    in[30] = _mm_srai_epi16(in[30], 6);
    in[31] = _mm_srai_epi16(in[31], 6);

    RECON_AND_STORE(dest, in[0]);
    RECON_AND_STORE(dest, in[1]);
    RECON_AND_STORE(dest, in[2]);
    RECON_AND_STORE(dest, in[3]);
    RECON_AND_STORE(dest, in[4]);
    RECON_AND_STORE(dest, in[5]);
    RECON_AND_STORE(dest, in[6]);
    RECON_AND_STORE(dest, in[7]);
    RECON_AND_STORE(dest, in[8]);
    RECON_AND_STORE(dest, in[9]);
    RECON_AND_STORE(dest, in[10]);
    RECON_AND_STORE(dest, in[11]);
    RECON_AND_STORE(dest, in[12]);
    RECON_AND_STORE(dest, in[13]);
    RECON_AND_STORE(dest, in[14]);
    RECON_AND_STORE(dest, in[15]);
    RECON_AND_STORE(dest, in[16]);
    RECON_AND_STORE(dest, in[17]);
    RECON_AND_STORE(dest, in[18]);
    RECON_AND_STORE(dest, in[19]);
    RECON_AND_STORE(dest, in[20]);
    RECON_AND_STORE(dest, in[21]);
    RECON_AND_STORE(dest, in[22]);
    RECON_AND_STORE(dest, in[23]);
    RECON_AND_STORE(dest, in[24]);
    RECON_AND_STORE(dest, in[25]);
    RECON_AND_STORE(dest, in[26]);
    RECON_AND_STORE(dest, in[27]);
    RECON_AND_STORE(dest, in[28]);
    RECON_AND_STORE(dest, in[29]);
    RECON_AND_STORE(dest, in[30]);
    RECON_AND_STORE(dest, in[31]);

    dest += 8 - (stride * 32);
  }
}

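// Full 1024-coefficient version: four 8x32 row blocks are transformed
// and buffered in col[], then four column passes complete the 2-D
// transform. Each row block is OR-reduced first so that an all-zero
// block can skip its 1-D transform entirely.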
void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
                                 int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[128], zero_idx[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
      stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
      stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
      stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
      stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j, i32;

  for (i = 0; i < 4; i++) {
    i32 = (i << 5);
    // First 1-D idct
    // Load input data.
    LOAD_DQCOEFF(in[0], input);
    LOAD_DQCOEFF(in[8], input);
    LOAD_DQCOEFF(in[16], input);
    LOAD_DQCOEFF(in[24], input);
    LOAD_DQCOEFF(in[1], input);
    LOAD_DQCOEFF(in[9], input);
    LOAD_DQCOEFF(in[17], input);
    LOAD_DQCOEFF(in[25], input);
    LOAD_DQCOEFF(in[2], input);
    LOAD_DQCOEFF(in[10], input);
    LOAD_DQCOEFF(in[18], input);
    LOAD_DQCOEFF(in[26], input);
    LOAD_DQCOEFF(in[3], input);
    LOAD_DQCOEFF(in[11], input);
    LOAD_DQCOEFF(in[19], input);
    LOAD_DQCOEFF(in[27], input);

    LOAD_DQCOEFF(in[4], input);
    LOAD_DQCOEFF(in[12], input);
    LOAD_DQCOEFF(in[20], input);
    LOAD_DQCOEFF(in[28], input);
    LOAD_DQCOEFF(in[5], input);
    LOAD_DQCOEFF(in[13], input);
    LOAD_DQCOEFF(in[21], input);
    LOAD_DQCOEFF(in[29], input);
    LOAD_DQCOEFF(in[6], input);
    LOAD_DQCOEFF(in[14], input);
    LOAD_DQCOEFF(in[22], input);
    LOAD_DQCOEFF(in[30], input);
    LOAD_DQCOEFF(in[7], input);
    LOAD_DQCOEFF(in[15], input);
    LOAD_DQCOEFF(in[23], input);
    LOAD_DQCOEFF(in[31], input);

    // checking if all entries are zero
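    // (The 32 coefficient vectors are OR-ed down a reduction tree; a
    // single compare-against-zero plus movemask then tests the whole
    // 8x32 block in one branch.)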
    zero_idx[0] = _mm_or_si128(in[0], in[1]);
    zero_idx[1] = _mm_or_si128(in[2], in[3]);
    zero_idx[2] = _mm_or_si128(in[4], in[5]);
    zero_idx[3] = _mm_or_si128(in[6], in[7]);
    zero_idx[4] = _mm_or_si128(in[8], in[9]);
    zero_idx[5] = _mm_or_si128(in[10], in[11]);
    zero_idx[6] = _mm_or_si128(in[12], in[13]);
    zero_idx[7] = _mm_or_si128(in[14], in[15]);
    zero_idx[8] = _mm_or_si128(in[16], in[17]);
    zero_idx[9] = _mm_or_si128(in[18], in[19]);
    zero_idx[10] = _mm_or_si128(in[20], in[21]);
    zero_idx[11] = _mm_or_si128(in[22], in[23]);
    zero_idx[12] = _mm_or_si128(in[24], in[25]);
    zero_idx[13] = _mm_or_si128(in[26], in[27]);
    zero_idx[14] = _mm_or_si128(in[28], in[29]);
    zero_idx[15] = _mm_or_si128(in[30], in[31]);

    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);

    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);

    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
      col[i32 + 0] = _mm_setzero_si128();
      col[i32 + 1] = _mm_setzero_si128();
      col[i32 + 2] = _mm_setzero_si128();
      col[i32 + 3] = _mm_setzero_si128();
      col[i32 + 4] = _mm_setzero_si128();
      col[i32 + 5] = _mm_setzero_si128();
      col[i32 + 6] = _mm_setzero_si128();
      col[i32 + 7] = _mm_setzero_si128();
      col[i32 + 8] = _mm_setzero_si128();
      col[i32 + 9] = _mm_setzero_si128();
      col[i32 + 10] = _mm_setzero_si128();
      col[i32 + 11] = _mm_setzero_si128();
      col[i32 + 12] = _mm_setzero_si128();
      col[i32 + 13] = _mm_setzero_si128();
      col[i32 + 14] = _mm_setzero_si128();
      col[i32 + 15] = _mm_setzero_si128();
      col[i32 + 16] = _mm_setzero_si128();
      col[i32 + 17] = _mm_setzero_si128();
      col[i32 + 18] = _mm_setzero_si128();
      col[i32 + 19] = _mm_setzero_si128();
      col[i32 + 20] = _mm_setzero_si128();
      col[i32 + 21] = _mm_setzero_si128();
      col[i32 + 22] = _mm_setzero_si128();
      col[i32 + 23] = _mm_setzero_si128();
      col[i32 + 24] = _mm_setzero_si128();
      col[i32 + 25] = _mm_setzero_si128();
      col[i32 + 26] = _mm_setzero_si128();
      col[i32 + 27] = _mm_setzero_si128();
      col[i32 + 28] = _mm_setzero_si128();
      col[i32 + 29] = _mm_setzero_si128();
      col[i32 + 30] = _mm_setzero_si128();
      col[i32 + 31] = _mm_setzero_si128();
      continue;
    }

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);
    array_transpose_8x8(in + 16, in + 16);
    array_transpose_8x8(in + 24, in + 24);

    IDCT32

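    // Pass-1 results for all four 8x32 blocks accumulate in col[128];
    // the second pass below reads them back through another transpose.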
    // 1-D: Store 32 intermediate results for each 8x32 block.
    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
  }
  for (i = 0; i < 4; i++) {
    // Second 1-D idct
    j = i <<